From e2e7c482e805cd4b8ac07fd1929bc2f9ee20e9a1 Mon Sep 17 00:00:00 2001 From: github-action-benchmark Date: Fri, 7 Jun 2024 07:48:48 +0000 Subject: [PATCH] add smaller_is_better (customSmallerIsBetter) benchmark result for 87571b8be8105738d6da87df053d5a32e7fa001e --- dev/bench/data.js | 33162 ++++++++++++++++++++++---------------------- 1 file changed, 16581 insertions(+), 16581 deletions(-) diff --git a/dev/bench/data.js b/dev/bench/data.js index bc64ed525e6d0..90374865a933a 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,5 +1,5 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1717746519706, + "lastUpdate": 1717746526986, "repoUrl": "https://github.com/neuralmagic/nm-vllm", "entries": { "bigger_is_better": [ @@ -68310,668 +68310,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-04-23T14:46:41Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488" }, - "date": 1714210771537, + "date": 1714297078376, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 14671.440407500086, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 66931.1053829997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 208.6643606973424, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 23342.27604626999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 161.46753849898232, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 17800.132582000515, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 117.89186615758388, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 261.657417060096, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 124.39165958450867, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 278.97989142772093, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6387.269492000087, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6064.163899500272, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 111.88932599993375, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 121.84322424331792, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 72.5244825007394, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 81.91542849999678, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 36.289724544148655, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39.81237977005924, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.01978473213134, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39.49480608319446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1923.126683501323, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 3666.834318499241, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 79.84467063327126, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 143.11174139328313, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.41554399918823, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 91.59698149960604, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.005281552743925, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 24.36734761260045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.09402611059665, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 23.649514210886167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2136.2335784997413, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6355.360378500336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 136.25811479666177, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 111.07957002678936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 104.31731850076176, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 69.13049949980632, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 15.10960127592157, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 36.273124083950954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 13.650743723656545, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 36.95699933018845, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18574.00114200027, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6063.323772000331, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1102.113974073327, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 127.74053738003205, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 230.7276385017758, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 92.22327899988159, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 162.38298922781706, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.47740103380456, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 148.92584730583258, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.41175559645828, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 62434.5530745004, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40341.59530111335, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40089.88794800007, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 99.23333490961022, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 101.38731865845863, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6134.139074499217, + "value": 6138.0041939992225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 106.36771252658946, + "value": 105.25370153986538, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 68.1048379992717, + "value": 69.34573000035016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 34.66033560242653, + "value": 34.66916330861514, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 35.17876881436496, + "value": 35.20249453031695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 65613.32472899994, + "value": 67696.56554000176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 28628.61416851538, + "value": 30013.547229894015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 27332.809500499934, + "value": 29051.331959499294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 188.24308156207644, + "value": 189.95778387274478, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 199.30614640370118, + "value": 200.71795366500223, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6062.0934375001525, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 257689.3706384999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 122.8672970640667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 243934.78863560208, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 94.20262949970493, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 244595.2843140003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 55.13576839101936, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 69.76695993422113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 48.08502645026652, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 67.07681470882945, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 77644.25305650092, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6068.786497999099, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 58056.55871913598, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 121.80484077736523, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 71003.32071900084, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 89.96812399891496, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 68.41126599025908, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 55.036998853002125, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 65.81709570907654, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 48.161595403597964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 66410.8578134992, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2142.5964224999916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23003.429712153353, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 136.95423646659037, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 17349.811404999855, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 105.06599900054425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 261.2103598160443, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 15.088356492075098, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 278.55816558679993, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 13.655097184058752, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5222.535258501011, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1923.5307070002818, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 89.90427876022295, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 79.52589756001544, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 61.31367800117005, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 37.84988749976037, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.250890948022324, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 10.990196552600976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 31.679306717872336, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.101862315424263, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 7859.990413498963, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 14727.956755001287, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 156.30002720930983, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 207.43928320336272, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 127.74674149932252, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 161.8715450003947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 58.084073885518116, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 118.73434108304961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 57.587868500857205, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 125.11435313409407, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5386.467451000499, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 5239.008678499886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 182.94118360667682, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 90.91267326681798, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 172.65122749995498, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 61.70038650088827, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.44215306390788, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 37.28207667565892, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.65400143078365, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 31.687763794898334, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12297.73155500061, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1815.862186999766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 198.2033804333405, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 92.85708308672838, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 171.63947199969698, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 59.3996994994086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 96.09748940261052, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 12.503594600157154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 101.07422222877577, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.726245442219327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6068.778347000261, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2432.085828500931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 121.58755756997077, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 115.97731625733529, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 81.96738949936844, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 83.34300799924677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.834716322933176, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 18.038354873429853, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.42931719131392, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 16.147303457152088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6054.284574499434, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2071.159581500069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 127.06042101664936, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 117.38934138680634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 92.53861100114591, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 86.24154350036406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.60141152783043, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.614447459022692, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.35647327143947, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.609165830648937, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1814.8084434997145, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 18468.7140394999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 93.60767734337666, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1110.133476052722, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 57.111170000098355, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 236.71987249872473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12.478057171616896, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 162.36971628466682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.78352964388927, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 149.287468383281, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 62489.75667349987, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 7876.454225500311, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40302.325046974685, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 156.95443960801884, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39308.59702949965, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 128.45143799859216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 99.83318207948984, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 57.94276547439565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 102.22480738465657, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 57.508369824266325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2070.6088160004583, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 78565.39773550048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 116.26469750008255, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 58711.33112383332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 86.99318649996712, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 72157.52103149952, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.632297370645555, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 68.35338342569004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.61245543034712, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 65.91079877096972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2426.99449849988, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 5395.843203001277, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 116.47309425734177, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 183.65219166668734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 80.91874300043855, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 178.78151750119287, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18.093533888949484, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.48714043924613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 16.203817817727327, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 37.688588111216575, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 258293.34289049986, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 243556.67614118863, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 244923.2257779995, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 69.73269317950292, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 66.83747040314358, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 3653.525684001579, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 12238.293557499674, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 143.22962458001712, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 199.42883660927086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 92.1650234995468, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 168.41479350023292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 24.40717868159896, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 94.97464477132647, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23.733869918782684, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 99.14474897894794, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -68992,668 +68992,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-04-23T14:46:41Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488" }, - "date": 1714297078376, + "date": 1714383658525, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 66931.1053829997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 5403.3334414980345, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23342.27604626999, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 182.6672527786577, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 17800.132582000515, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 173.26183499972103, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 261.657417060096, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.570686681999014, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 278.97989142772093, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 37.71272982877713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 06:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6064.163899500272, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 14712.270560000434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 121.84322424331792, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 211.4012420333347, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 81.91542849999678, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 165.1276724996933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.81237977005924, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 118.61316128523288, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.49480608319446, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 125.23053596180378, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:45:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 3666.834318499241, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 12388.316153499545, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 143.11174139328313, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 202.90839387460193, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 91.59698149960604, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 177.19953449977766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 24.36734761260045, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 95.76608171097477, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23.649514210886167, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 100.6277065136866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:12:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6355.360378500336, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 7894.943926500673, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 111.07957002678936, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 157.38730911329912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 69.13049949980632, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 131.29129100070713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 36.273124083950954, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 58.145339403127906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 36.95699933018845, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 57.77174892640247, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:03:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6063.323772000331, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6143.9237764998325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 127.74053738003205, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 105.57952202662516, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 92.22327899988159, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 68.90616300006513, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.47740103380456, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 34.740065797105615, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.41175559645828, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 35.27362721178171, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:10:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 62434.5530745004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2431.906657499894, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40341.59530111335, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 116.1085367613535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40089.88794800007, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 80.23967450026248, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 99.23333490961022, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 18.105119612849144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 101.38731865845863, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 16.194528842399823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:24:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6138.0041939992225, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 78082.65650449993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 105.25370153986538, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 58163.472767429324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 69.34573000035016, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 71294.46411749996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 34.66916330861514, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 67.35790102715542, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 35.20249453031695, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 65.52902275067585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:35:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67696.56554000176, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 5234.401947000151, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 30013.547229894015, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 90.68538420006614, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 29051.331959499294, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 60.826272501799394, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 189.95778387274478, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 37.25110775777427, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 200.71795366500223, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 31.682919228131336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:33:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 257689.3706384999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2065.334467500179, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 243934.78863560208, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 117.37230603336987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 244595.2843140003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 86.2206050005625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 69.76695993422113, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.59097868551506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67.07681470882945, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.618711720487603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:08:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6068.786497999099, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 67333.70489449863, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 121.80484077736523, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 23449.42444652001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 89.96812399891496, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 17718.90502399947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 55.036998853002125, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 262.3360423793547, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 48.161595403597964, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 279.50231642284837, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:49:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2142.5964224999916, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 18693.89975500053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 136.95423646659037, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1106.037561508017, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 105.06599900054425, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 234.87662999832537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 15.088356492075098, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 162.23267729135304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 13.655097184058752, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 149.22055482149233, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:47:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1923.5307070002818, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 61705.24036599909, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 79.52589756001544, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39382.883654935344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.84988749976037, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 38853.11500399985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 10.990196552600976, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 98.6048637101342, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.101862315424263, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 100.93879064096944, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:15:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 14727.956755001287, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6106.1727380001685, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 207.43928320336272, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 122.2248624533313, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 161.8715450003947, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 82.13618149875401, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 118.73434108304961, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39.866295219973715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 125.11435313409407, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39.483253903939236, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:34:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5239.008678499886, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1841.4298955003687, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 90.91267326681798, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 92.76224013333679, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 61.70038650088827, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 59.39676350044465, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.28207667565892, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 12.53072772650871, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 31.687763794898334, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.7504803795625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:41:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1815.862186999766, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1928.7328335003622, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 92.85708308672838, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 79.24429737999162, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 59.3996994994086, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 38.83016149939067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12.503594600157154, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.01682881146633, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.726245442219327, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.104867589473509, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:21:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2432.085828500931, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6375.923833499655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 115.97731625733529, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 109.70417828670179, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 83.34300799924677, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 70.59495649991732, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18.038354873429853, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 36.29284477344846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 16.147303457152088, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 36.98958335851306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:27:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2071.159581500069, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 260014.52012900062, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 117.38934138680634, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 244655.706315512, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 86.24154350036406, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 246336.86162399955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.614447459022692, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 70.77495113818273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.609165830648937, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 66.8261656403469, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:41:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18468.7140394999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 3645.2240139988135, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1110.133476052722, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 143.44058583311076, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 236.71987249872473, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 94.05171199978213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 162.36971628466682, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 24.43651480224753, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 149.287468383281, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 23.768530231824148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:57:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 7876.454225500311, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6056.600670500302, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 156.95443960801884, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 128.3224846033469, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 128.45143799859216, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 87.35010950022115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 57.94276547439565, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.54763684967656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 57.508369824266325, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.42104585396235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:16:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 78565.39773550048, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6055.8608940009435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 58711.33112383332, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 122.8573354986826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 72157.52103149952, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 92.14041750055912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 68.35338342569004, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 55.07886411780935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 65.91079877096972, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 48.0826652607532, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 04:53:24 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5395.843203001277, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 66309.31746399801, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 183.65219166668734, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 29019.913823834024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 178.78151750119287, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 27652.122268498715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.48714043924613, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 188.22594207093397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.688588111216575, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 199.4164384190758, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-28 06:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12238.293557499674, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2122.4544235001304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 199.42883660927086, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 136.2453950600563, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 168.41479350023292, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 104.65170849965943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 94.97464477132647, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 15.058391072166453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 99.14474897894794, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 13.626770453407055, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-28 05:54:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -69674,4078 +69674,4078 @@ window.BENCHMARK_DATA = { "timestamp": "2024-04-23T14:46:41Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488" }, - "date": 1714383658525, + "date": 1714470075644, "tool": "customSmallerIsBetter", "benches": [ { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5403.3334414980345, + "value": 5376.062333001755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 182.6672527786577, + "value": 184.5827273187315, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 173.26183499972103, + "value": 173.52721449969977, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.570686681999014, + "value": 40.43196687663791, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.71272982877713, + "value": 37.58095798642143, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 14712.270560000434, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 12320.661744000063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 211.4012420333347, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 200.01820418526768, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 165.1276724996933, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 169.44416600108525, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 118.61316128523288, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 96.04773220335846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 125.23053596180378, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 101.17233322299779, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:36:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12388.316153499545, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 3652.386217499952, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 202.90839387460193, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 142.93804023662233, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 177.19953449977766, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 95.80902800007607, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 95.76608171097477, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 24.498444220468098, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 100.6277065136866, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 23.784562093061254, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:56:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 7894.943926500673, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1823.0353210010435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 157.38730911329912, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 93.06960186000651, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 131.29129100070713, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 59.665249999852676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 58.145339403127906, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 12.502380206719828, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 57.77174892640247, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.783106806446016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:19:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6143.9237764998325, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 5238.431358498929, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 105.57952202662516, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 91.44519897992723, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 68.90616300006513, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 60.88910249854962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 34.740065797105615, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 37.252881617494005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 35.27362721178171, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 31.70269005355473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:37:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2431.906657499894, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 1929.8616509995554, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 116.1085367613535, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 79.59311796000596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 80.23967450026248, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39.459064500078966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18.105119612849144, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 10.979780574097273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 16.194528842399823, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.09183504894853, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:30:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 78082.65650449993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 65573.8574650004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 58163.472767429324, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 28950.78831299604, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 71294.46411749996, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 28142.69032300217, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67.35790102715542, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 188.86090202522692, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 65.52902275067585, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 199.33138919321786, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5234.401947000151, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6378.783236000345, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 90.68538420006614, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 108.38548640673253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 60.826272501799394, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 71.50023299982422, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.25110775777427, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 36.32068798008349, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 31.682919228131336, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 36.99483870421225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:43:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2065.334467500179, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6047.278685499805, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 117.37230603336987, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 125.4039507166696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 86.2206050005625, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 87.60409249862278, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.59097868551506, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.662599829984984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.618711720487603, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40.43363331252289, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:44:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67333.70489449863, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6089.519108001696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23449.42444652001, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 120.69910238930122, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 17718.90502399947, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 89.65837299911072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 262.3360423793547, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 55.06700430067017, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 279.50231642284837, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 48.04969955440588, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 06:05:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18693.89975500053, + "value": 18383.634987500045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1106.037561508017, + "value": 977.7821168053584, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 234.87662999832537, + "value": 233.23141549917636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 162.23267729135304, + "value": 161.4755453013489, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 149.22055482149233, + "value": 147.98108700900505, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 61705.24036599909, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 78107.6898225001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39382.883654935344, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 58225.91374190133, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 38853.11500399985, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 71513.63180799945, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 98.6048637101342, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 67.80498533429089, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 100.93879064096944, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 65.44014757829031, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:27:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6106.1727380001685, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 66338.3866639997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 122.2248624533313, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 22944.870291643343, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 82.13618149875401, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 17367.390752000574, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.866295219973715, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 260.65817349493784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.483253903939236, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 277.99167895448284, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:47:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1841.4298955003687, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2134.466259499277, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 92.76224013333679, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 136.62190305008457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 59.39676350044465, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 102.67750049933966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12.53072772650871, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 15.099409057440004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.7504803795625, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 13.654440891257662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:24:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1928.7328335003622, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2408.684269999867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 79.24429737999162, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 116.16479182931411, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 38.83016149939067, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 80.27312500053085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.01682881146633, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 18.106101958196053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.104867589473509, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 16.139953761015917, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:18:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6375.923833499655, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 14822.945272499965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 109.70417828670179, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 205.64598047202162, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 70.59495649991732, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 163.80199600098422, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 36.29284477344846, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 119.56480416917998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 36.98958335851306, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 126.0499936903422, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:02:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 260014.52012900062, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 7886.070769500293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 244655.706315512, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 157.57009318131654, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 246336.86162399955, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 132.7795765000701, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 70.77495113818273, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 58.182222314526086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 66.8261656403469, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 57.73655366013833, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 3645.2240139988135, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6141.3096454998595, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 143.44058583311076, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 104.48978885987647, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 94.05171199978213, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 67.92814349955734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 24.43651480224753, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 34.73078370484817, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23.768530231824148, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 35.22087641082219, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:14:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6056.600670500302, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 259394.88207350043, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 128.3224846033469, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 244259.34381698666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 87.35010950022115, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 245936.9741119999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.54763684967656, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 70.70314073132613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.42104585396235, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 66.68409539982376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 04:12:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6055.8608940009435, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 6071.920400999261, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 122.8573354986826, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 121.85457794001802, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 92.14041750055912, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 78.61482199950842, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 55.07886411780935, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39.801395855077615, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 48.0826652607532, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39.502406777722285, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:51:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 66309.31746399801, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 63004.24120249954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 29019.913823834024, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 40527.186443927385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 27652.122268498715, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 39565.95233650023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 188.22594207093397, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 99.86915523887204, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 199.4164384190758, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 102.12643246719269, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-29 06:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2122.4544235001304, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 2066.5399399995295, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 136.2453950600563, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 116.18510289994447, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 104.65170849965943, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 86.63544850060134, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 15.058391072166453, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.6430866408856, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 13.626770453407055, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", + "value": 11.652037467004137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-29 05:50:07 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Andy Linfoot", - "username": "andy-neuma", - "email": "78757007+andy-neuma@users.noreply.github.com" + "name": "Robert Shaw", + "username": "robertgshaw2-neuralmagic", + "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "df1f1a00d1fb111ef035ac385fafa38b5ed34488", - "message": "switch to GCP based build VM (#201)\n\nSUMMARY:\r\n* switch over to GCP VM's for building stage of \"remote push\"\r\n\r\nNOTE: this is just the start. i'll redo the benchmarking and nightly\r\nworkflows in an upcoming PR.\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\nCo-authored-by: andy-neuma ", - "timestamp": "2024-04-23T14:46:41Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488" + "id": "d485d3e5c9721b27cd0fe345d062fabc038049a1", + "message": "Marlin 2:4 Downstream (for v0.3 release) (#239)\n\nSupport marlin 2:4 in downstream so we can have it in the release", + "timestamp": "2024-05-14T01:54:06Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/d485d3e5c9721b27cd0fe345d062fabc038049a1" }, - "date": 1714470075644, + "date": 1715671992207, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5376.062333001755, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7699.20140399995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 184.5827273187315, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.60859181600426, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 173.52721449969977, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.44680549999521, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.43196687663791, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.35804097393765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.58095798642143, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.06640501978451, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:24:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12320.661744000063, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14952.524137000182, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 200.01820418526768, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 213.97269461333295, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 169.44416600108525, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.41449899988947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 96.04773220335846, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.74922727076351, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 101.17233322299779, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.51590181598549, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:55:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 3652.386217499952, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3631.2452459997075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 142.93804023662233, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.76172412334503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 95.80902800007607, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.35134399993694, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 24.498444220468098, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.310515078398957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23.784562093061254, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.651293949528196, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:14:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1823.0353210010435, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5856.753418500375, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 93.06960186000651, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.7610248973045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 59.665249999852676, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.27816299910774, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12.502380206719828, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.309391464699814, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.783106806446016, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.510393959855044, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:22:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 5238.431358498929, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16020.202677499583, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 91.44519897992723, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 253.06591161863494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 60.88910249854962, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.56535249925219, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 37.252881617494005, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.14549104973028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 31.70269005355473, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 134.71175487648534, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:42:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 1929.8616509995554, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2412.483122499907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 79.59311796000596, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.41906162533633, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.459064500078966, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.04159249973964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 10.979780574097273, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.954717798726882, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.09183504894853, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.191531535676557, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:16:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 65573.8574650004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1928.175570500116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 28950.78831299604, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.48966796332327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 28142.69032300217, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.23218350042225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 188.86090202522692, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.258276658293926, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 199.33138919321786, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.51521060171141, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:34:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6378.783236000345, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6352.16404099998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 108.38548640673253, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.16861786666747, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 71.50023299982422, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.68160549998947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 36.32068798008349, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.22538058499631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 36.99483870421225, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.91675834190811, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:00:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6047.278685499805, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1940.8526524998706, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 125.4039507166696, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.21267474664396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 87.60409249862278, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.89376600020478, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.662599829984984, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.089562012528129, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40.43363331252289, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.173363486138213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:11:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6089.519108001696, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63194.19884299987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 120.69910238930122, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 27375.426378377993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 89.65837299911072, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25736.021327001254, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 55.06700430067017, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.86486330113644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 48.04969955440588, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.56442768654986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:51:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18383.634987500045, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 75939.70493999995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 977.7821168053584, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56341.681981972004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 233.23141549917636, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69316.22169849992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 161.4755453013489, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.55430329170304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 147.98108700900505, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.33734177531761, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-30 06:58:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 78107.6898225001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2033.819562499957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 58225.91374190133, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.19037280665968, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 71513.63180799945, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.89878250038237, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67.80498533429089, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.560121123356186, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 65.44014757829031, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.691221148500436, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:54:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 66338.3866639997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2571.6090910000275, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 22944.870291643343, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.15518762534703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 17367.390752000574, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.6943180002454, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 260.65817349493784, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.216760619551575, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 277.99167895448284, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.232633518539803, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 06:04:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2134.466259499277, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5996.506612500014, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 136.62190305008457, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.50568606667495, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 102.67750049933966, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.20934350001153, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 15.099409057440004, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.21830872028567, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 13.654440891257662, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.20192526693389, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:48:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2408.684269999867, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5082.261738499255, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 116.16479182931411, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.498999799934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 80.27312500053085, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.69171000035567, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 18.106101958196053, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.21554315710666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 16.139953761015917, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.829622089502823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:28:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 14822.945272499965, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13589.161211999908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 205.64598047202162, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.4137675746612, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 163.80199600098422, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.45165350012758, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 119.56480416917998, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.80662248274147, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 126.0499936903422, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.48762767348046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:35:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 7886.070769500293, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 254818.33761899997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 157.57009318131654, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241011.8444389933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 132.7795765000701, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242041.27989450013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 58.182222314526086, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.88961185379736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 57.73655366013833, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.35204241527413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:17:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6141.3096454998595, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60430.702900000026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 104.48978885987647, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38304.701958351994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67.92814349955734, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37548.55232750003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 34.73078370484817, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.69063523534015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 35.22087641082219, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.23740548629765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:36:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 259394.88207350043, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6053.155197000024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 244259.34381698666, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.90334264000285, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 245936.9741119999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.16031500003373, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 70.70314073132613, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.63900264773282, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 66.68409539982376, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.268328228789144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:09:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6071.920400999261, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5320.154192500013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 121.85457794001802, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.2350845293695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 78.61482199950842, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.38535149933887, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.801395855077615, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.67783502514967, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.502406777722285, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.79312972752162, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:46:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 63004.24120249954, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1828.8207245000194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40527.186443927385, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.21822712335475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39565.95233650023, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.458014999861916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 99.86915523887204, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.639844913203472, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 102.12643246719269, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.911983000670325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 04:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 2066.5399399995295, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6098.667970999941, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 116.18510289994447, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.38908700000866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 86.63544850060134, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.73955900007422, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.6430866408856, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.51027741501625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 11.652037467004137, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.03588377959485, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-30 05:42:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Robert Shaw", - "username": "robertgshaw2-neuralmagic", - "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com" + "name": "Domenic Barbuzzi", + "username": "dbarbuzzi", + "email": "dbarbuzzi@gmail.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "d485d3e5c9721b27cd0fe345d062fabc038049a1", - "message": "Marlin 2:4 Downstream (for v0.3 release) (#239)\n\nSupport marlin 2:4 in downstream so we can have it in the release", - "timestamp": "2024-05-14T01:54:06Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/d485d3e5c9721b27cd0fe345d062fabc038049a1" + "id": "3a2545670126854a4a685edd889fe68f2fe250c3", + "message": "Misc CI/CD updates (#240)", + "timestamp": "2024-05-14T14:41:26Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/3a2545670126854a4a685edd889fe68f2fe250c3" }, - "date": 1715671992207, + "date": 1715759306117, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7699.20140399995, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6101.23209849985, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.60859181600426, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.81138589331081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.44680549999521, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.00526649986205, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.35804097393765, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.55243954826612, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.06640501978451, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.05923950958349, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:31:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14952.524137000182, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7680.819542500103, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 213.97269461333295, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.6045172239992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.41449899988947, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.96245799999906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.74922727076351, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.10173666541715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.51590181598549, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.79671118557445, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:10:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3631.2452459997075, + "value": 3649.93440049966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.76172412334503, + "value": 142.4122476466558, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.35134399993694, + "value": 92.2783384999093, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.310515078398957, + "value": 24.36573514002923, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.651293949528196, + "value": 23.69197726093354, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:20:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5856.753418500375, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 254531.2522874997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.7610248973045, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 240153.88880120867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.27816299910774, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241515.4948259999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.309391464699814, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.14310567244951, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.510393959855044, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.18372669169628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:57:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16020.202677499583, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6043.307231999961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 253.06591161863494, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.88578150666443, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.56535249925219, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.40760800003955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.14549104973028, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.35962738190891, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 134.71175487648534, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.23074240547815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 05:04:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2412.483122499907, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6041.992879999725, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.41906162533633, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.46525663999304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.04159249973964, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.24388900003032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.954717798726882, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.55438731048751, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.191531535676557, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.16477230530249, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:37:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1928.175570500116, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1942.0991919996595, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.48966796332327, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.98660993330243, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.23218350042225, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.880631999996695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.258276658293926, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.054870860215237, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.51521060171141, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.121548043603779, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:58:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6352.16404099998, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5049.914612500288, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.16861786666747, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.22882873998849, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.68160549998947, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.91983600021922, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.22538058499631, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.16473733373438, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.91675834190811, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.799053717961876, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:18:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1940.8526524998706, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59942.670705500066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.21267474664396, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38040.75315246733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.89376600020478, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37545.920685000056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.089562012528129, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.56689999797372, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.173363486138213, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.25388810633912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:25:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63194.19884299987, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2411.809580999943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 27375.426378377993, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.5320580693345, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25736.021327001254, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.90481249953518, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.86486330113644, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.94385816744697, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.56442768654986, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.169444361253195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:40:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 75939.70493999995, + "value": 76860.2623894999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56341.681981972004, + "value": 57203.97092948268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69316.22169849992, + "value": 70173.28971250003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.55430329170304, + "value": 67.93434489798318, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.33734177531761, + "value": 64.58440380578197, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:04:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2033.819562499957, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5330.106188499485, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.19037280665968, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.17156377735228, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.89878250038237, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.8889075002371, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.560121123356186, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.881822445030636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.691221148500436, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.93948749345439, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:51:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2571.6090910000275, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1920.0154015002227, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.15518762534703, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.74568158999682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.6943180002454, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.970186999838916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.216760619551575, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.202795011221378, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.232633518539803, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.448855716742237, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5996.506612500014, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15703.07831700029, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.50568606667495, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 224.3258602553542, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.20934350001153, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 150.11075299935328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.21830872028567, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.76176657570113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.20192526693389, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.821358348712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:24:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5082.261738499255, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14704.967883500103, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.498999799934, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 215.22999095600426, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.69171000035567, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 167.70195599974613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.21554315710666, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.65424397527357, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.829622089502823, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.98717967737052, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:49:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13589.161211999908, + "value": 13476.068847999613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.4137675746612, + "value": 199.25043000799937, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.45165350012758, + "value": 154.84395100020265, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.80662248274147, + "value": 105.13777527980294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.48762767348046, + "value": 109.68111962814491, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:44:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 254818.33761899997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61124.359150999226, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241011.8444389933, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26456.67674058268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242041.27989450013, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24940.77680600003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.88961185379736, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.534421332363, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.35204241527413, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.434688678055, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:18:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60430.702900000026, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5879.7700890008855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38304.701958351994, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.73317806397729, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37548.55232750003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.43678700011515, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.69063523534015, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.14737516562541, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.23740548629765, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.20036354770608, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:39:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6053.155197000024, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1818.5470739995253, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.90334264000285, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.5462979966657, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.16031500003373, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.357280499923945, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.63900264773282, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.59746519769587, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.268328228789144, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.822087371719496, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:55:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5320.154192500013, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6362.97378599994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.2350845293695, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.44010315333193, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.38535149933887, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.66087950004385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.67783502514967, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.251718120805435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.79312972752162, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.98496586314468, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-14 04:30:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1828.8207245000194, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2032.796726000015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.21822712335475, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.79107167337497, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.458014999861916, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.870010999948136, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.639844913203472, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.590794727441683, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.911983000670325, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.656013017847151, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 03:31:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6098.667970999941, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2563.39438949999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.38908700000866, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.95549527066639, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.73955900007422, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.83280099997137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.51027741501625, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.130742047195096, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.03588377959485, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.251971973906564, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-14 02:48:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Domenic Barbuzzi", - "username": "dbarbuzzi", - "email": "dbarbuzzi@gmail.com" + "name": "dhuangnm", + "username": "dhuangnm", + "email": "74931910+dhuangnm@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "3a2545670126854a4a685edd889fe68f2fe250c3", - "message": "Misc CI/CD updates (#240)", - "timestamp": "2024-05-14T14:41:26Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/3a2545670126854a4a685edd889fe68f2fe250c3" + "id": "6334dd3bfbd72d30e969bd02edb1280cad5af5a2", + "message": "bump version to 0.3.0 (#241)\n\nbump version to 0.3.0", + "timestamp": "2024-05-15T13:23:12Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/6334dd3bfbd72d30e969bd02edb1280cad5af5a2" }, - "date": 1715759306117, + "date": 1715849787199, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6101.23209849985, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 258118.46169200022, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.81138589331081, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242367.391415986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.00526649986205, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244689.68778350018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.55243954826612, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.69259278402636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.05923950958349, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.36323963001757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:02:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7680.819542500103, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3638.984193500164, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.6045172239992, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 141.95815483667502, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.96245799999906, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.62860549981633, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.10173666541715, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.342034556891115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.79671118557445, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.64824972562455, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:45:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3649.93440049966, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59371.91879149998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.4122476466558, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37311.768499666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.2783384999093, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36652.779958999985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.36573514002923, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.33364648464519, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.69197726093354, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.55867090197547, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:35:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 254531.2522874997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13685.08536049967, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 240153.88880120867, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 200.8336691906637, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241515.4948259999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.36061550044178, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.14310567244951, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.19594483218475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.18372669169628, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 112.39465867675736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:33:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6043.307231999961, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1819.5840399998815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.88578150666443, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.42182694997305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.40760800003955, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.673415999692224, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.35962738190891, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.590435891314716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.23074240547815, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.866962514121967, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:39:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6041.992879999725, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6361.116683499972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.46525663999304, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.36964258666406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.24388900003032, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.05765199994585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.55438731048751, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.20724675453855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.16477230530249, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.91689382031487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:10:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1942.0991919996595, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5338.608900499821, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.98660993330243, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.17088898930766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.880631999996695, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 175.92986100044072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.054870860215237, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.62952981019738, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.121548043603779, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.958026242584666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:40:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5049.914612500288, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6099.107161499887, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.22882873998849, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.43499850663466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.91983600021922, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.7180684999239, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.16473733373438, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.54700459908418, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.799053717961876, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.045846241846135, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:03:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59942.670705500066, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63258.93250450008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38040.75315246733, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 27231.074444095368, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37545.920685000056, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25326.550558000235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.56689999797372, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 186.14266411794483, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.25388810633912, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.1091215325504, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:53:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2411.809580999943, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2029.6977950006294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.5320580693345, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.6466327933352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.90481249953518, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42.70703700012746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.94385816744697, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.599106473887705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.169444361253195, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.677298393022797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:52:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76860.2623894999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2394.628295000075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57203.97092948268, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.39873904799848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70173.28971250003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.23161299999992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.93434489798318, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.90703764958482, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.58440380578197, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.11724712321176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:19:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5330.106188499485, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14672.161679499823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.17156377735228, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 213.41412069200305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.8889075002371, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.149260500144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.881822445030636, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.7828334274791, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.93948749345439, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.60739112711852, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:45:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1920.0154015002227, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5051.6441065001345, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.74568158999682, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.2713155733339, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.970186999838916, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.221417499335075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.202795011221378, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.14609481401333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.448855716742237, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.811671150266346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:12:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15703.07831700029, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7640.926798500004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 224.3258602553542, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.8186790013333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 150.11075299935328, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.10266649998903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.76176657570113, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.98906212683416, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.821358348712, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.57185544198197, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:19:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14704.967883500103, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1921.5456125002675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 215.22999095600426, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.37519164667174, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 167.70195599974613, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.093453000059526, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.65424397527357, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.16933657139751, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.98717967737052, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.475328412495637, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:25:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13476.068847999613, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15725.322180499461, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 199.25043000799937, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 228.28753929002778, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.84395100020265, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.56340050007566, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.13777527980294, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.2820009003947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.68111962814491, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.30119400852809, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:59:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61124.359150999226, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5888.8233325005785, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26456.67674058268, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.89838669599703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24940.77680600003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.66479900057311, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.534421332363, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.12787558749585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.434688678055, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.272510246963996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 04:55:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5879.7700890008855, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2589.057005000086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.73317806397729, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.58779895730913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.43678700011515, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.76591749993167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.14737516562541, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.13812961430733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.20036354770608, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.174284445696333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-15 05:12:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1818.5470739995253, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6056.139924999969, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.5462979966657, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.55583270999773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.357280499923945, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.00014349999219, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.59746519769587, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.448909689841884, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.822087371719496, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.259250252047636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 03:46:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6362.97378599994, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6027.659153499826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.44010315333193, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.16549665333878, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.66087950004385, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77.90766700009044, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.251718120805435, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.63301758757746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.98496586314468, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.29818135093642, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 02:33:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2032.796726000015, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 75949.34716449984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.79107167337497, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56493.18321569867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.870010999948136, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69423.33838600008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.590794727441683, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.62640358472382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.656013017847151, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.55388043976367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:06:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2563.39438949999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1937.9856095001742, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.95549527066639, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.23790262007363, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.83280099997137, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 43.146204000549915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.130742047195096, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.083233474550786, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.251971973906564, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.149322394807717, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-15 04:19:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "dhuangnm", - "username": "dhuangnm", - "email": "74931910+dhuangnm@users.noreply.github.com" + "name": "Michael Goin", + "username": "mgoin", + "email": "michael@neuralmagic.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "6334dd3bfbd72d30e969bd02edb1280cad5af5a2", - "message": "bump version to 0.3.0 (#241)\n\nbump version to 0.3.0", - "timestamp": "2024-05-15T13:23:12Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/6334dd3bfbd72d30e969bd02edb1280cad5af5a2" + "id": "59cf939c70173f2419d143a738505180c37465a4", + "message": "Add FP8 and marlin 2:4 tests for lm-eval (#244)", + "timestamp": "2024-05-16T20:43:49Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/59cf939c70173f2419d143a738505180c37465a4" }, - "date": 1715849787199, + "date": 1715931046658, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 258118.46169200022, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6168.269709499782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242367.391415986, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.98510685333773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244689.68778350018, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.63699749987973, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.69259278402636, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.674676269909696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.36323963001757, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.19037892508264, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:41:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15854.127336500824, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 229.12089890001153, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.03108400025667, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.8418386060145, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.76484605535293, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3638.984193500164, + "value": 3638.4393895000358, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 141.95815483667502, + "value": 143.43216443332494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.62860549981633, + "value": 91.1189015000673, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.342034556891115, + "value": 24.13681476256774, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.64824972562455, + "value": 23.569469998782356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59371.91879149998, + "value": 59962.44221149993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37311.768499666, + "value": 37740.73947849, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36652.779958999985, + "value": 37190.24467849999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.33364648464519, + "value": 97.62116391947876, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.55867090197547, + "value": 99.52574322834293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:01:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13685.08536049967, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6363.175155999954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 200.8336691906637, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.46530145999759, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.36061550044178, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.22012699998459, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.19594483218475, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.18218308004019, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 112.39465867675736, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.90844879031058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:07:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1819.5840399998815, + "value": 1829.1798820000622, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.42182694997305, + "value": 92.8089847133136, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.673415999692224, + "value": 55.889288499656686, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.590435891314716, + "value": 12.61356858195317, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.866962514121967, + "value": 11.88905354923614, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:54:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6361.116683499972, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 254069.14275899975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.36964258666406, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 240695.88777094072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.05765199994585, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 240912.15783250003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.20724675453855, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.65105717837609, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.91689382031487, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.43780510097852, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:40:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5338.608900499821, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6011.775941000053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.17088898930766, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.2050380700025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 175.92986100044072, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.94614950008872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.62952981019738, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.3468362321537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.958026242584666, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.33262726330132, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 05:52:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6099.107161499887, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2410.191408999708, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.43499850663466, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.48227018665541, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.7180684999239, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.24950499995975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.54700459908418, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.93705259664011, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.045846241846135, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.160367230376785, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:10:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63258.93250450008, + "value": 63109.12443750021, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 27231.074444095368, + "value": 27099.457985118017, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25326.550558000235, + "value": 24794.5113629994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 186.14266411794483, + "value": 186.17630149679047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.1091215325504, + "value": 197.59613802394037, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:03:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2029.6977950006294, + "value": 2033.1724739999117, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.6466327933352, + "value": 81.60828334669229, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42.70703700012746, + "value": 39.80177150015152, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.599106473887705, + "value": 11.668408356669811, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.677298393022797, + "value": 11.696979096307233, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:14:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2394.628295000075, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5098.723483000867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.39873904799848, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.27253446010339, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.23161299999992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.85428400001547, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.90703764958482, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.22974063171822, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.11724712321176, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.82210335211133, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:00:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14672.161679499823, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1938.1165569998302, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 213.41412069200305, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.63462712665812, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.149260500144, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.184574000249995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.7828334274791, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.062266717402675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.60739112711852, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.160580846799421, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:33:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5051.6441065001345, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13524.57672249966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.2713155733339, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 198.33103487866478, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.221417499335075, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.28295849955975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.14609481401333, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.35815688233247, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.811671150266346, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.10263256790182, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:11:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7640.926798500004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5304.019143500227, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.8186790013333, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.5558509067002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.10266649998903, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.9265964997685, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.98906212683416, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.72565584426561, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.57185544198197, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.897744546202475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:53:01 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1921.5456125002675, + "value": 1921.0260259997085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.37519164667174, + "value": 95.76962023666056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.093453000059526, + "value": 58.09594550009933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.16933657139751, + "value": 13.251560110156246, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.475328412495637, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:20:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15725.322180499461, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 228.28753929002778, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.56340050007566, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.2820009003947, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.30119400852809, + "value": 12.549780864286399, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:27:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5888.8233325005785, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76498.90343950005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.89838669599703, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56876.460692046676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.66479900057311, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69992.24383849991, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.12787558749585, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.16335240767877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.272510246963996, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.58964462050396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-16 06:20:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2589.057005000086, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7698.409701499941, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.58779895730913, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.52133288799632, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.76591749993167, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.80558999997993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.13812961430733, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.29115957646765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.174284445696333, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.01315399741855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 05:26:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6056.139924999969, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14792.707703500128, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.55583270999773, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 218.14901325201268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.00014349999219, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.01783900009104, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.448909689841884, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.7628959969548, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.259250252047636, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.36463364552219, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 03:46:28 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6027.659153499826, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2575.180585000453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.16549665333878, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.64534286131433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77.90766700009044, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.11354049985675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.63301758757746, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.257292271003042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.29818135093642, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.321459875944225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:17:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 75949.34716449984, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5891.935765999733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56493.18321569867, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.47207699864036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69423.33838600008, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.99214000030042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.62640358472382, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.53983529779377, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.55388043976367, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.422655616102965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:26:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1937.9856095001742, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6054.575040000145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.23790262007363, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.50609336336493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 43.146204000549915, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.48204099989198, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.083233474550786, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.69021046032865, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.149322394807717, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.28364630012998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-16 04:48:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Michael Goin", - "username": "mgoin", - "email": "michael@neuralmagic.com" + "name": "Andy Linfoot", + "username": "andy-neuma", + "email": "78757007+andy-neuma@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "59cf939c70173f2419d143a738505180c37465a4", - "message": "Add FP8 and marlin 2:4 tests for lm-eval (#244)", - "timestamp": "2024-05-16T20:43:49Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/59cf939c70173f2419d143a738505180c37465a4" + "id": "d69a34a3194efb8dd34c1f293af91aac19b5b992", + "message": "updates for nm-magic-wand, nightly or release (#247)\n\nSUMMARY:\r\n* update GHA action `nm-build-vllm` to not install\r\n`nm-magic-wand-nightly`\r\n* update build script to not install `nm-magic-wand-nightly` (we might\r\nconsider getting rid of this script altogether, since we aren't really\r\nusing it)\r\n* remove unused GHA action `nm-test-vllm`. this has been superseded by\r\n`nm-install-test-whl`\r\n* update GHA action `nm-install-test-whl` to get version of\r\n`nm-magic-wand` if `nm-magic-wand-nightly` is not present\r\n* update `setup.py` to default generate \"nightly\" package and add option\r\nbased on ENV to generate release package. this also includes managing\r\nthe dependency on `nm-magic-wand`.\r\n* update `set-env` action to set ENV based on `wf_category` input\r\n* update \"release\" workflow to include all supported python versions\r\n* delete obsolete \"gen-whl\"\r\n\r\nNOTES:\r\n- \"magic-wand\" is only a runtime dependency, so no need to install it\r\nduring build phase.\r\n- this PR makes it so that we by default generate a \"nightly\" package\r\nwith a \"nightly\" version number. if we want to generate a release\r\npackage we'll need to specify `wf_category` as `RELEASE`.\r\n\r\nTEST PLAN:\r\nruns on remote push. verifying that `wf_category` set to `RELEASE` will\r\ngenerate appropriate package.\r\n\r\nran \"build\" workflow with `wf_category` set to `RELEASE` ... package\r\nlooks properly named and versioned ...\r\nhttps://github.com/neuralmagic/nm-vllm/actions/runs/9129675592\r\n\r\nthe \"remote push\" defaulted to generating a \"nightly\" package ... please\r\nsee ... https://github.com/neuralmagic/nm-vllm/actions/runs/9129665988\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma ", + "timestamp": "2024-05-17T18:21:50Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/d69a34a3194efb8dd34c1f293af91aac19b5b992" }, - "date": 1715931046658, + "date": 1716017339879, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6168.269709499782, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14810.016069499852, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.98510685333773, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 218.56008913000855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.63699749987973, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.22120100016036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.674676269909696, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.2988079861909, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.19037892508264, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.42463874072129, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:44:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15854.127336500824, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6047.7593895000155, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 229.12089890001153, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.70249681000102, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.03108400025667, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.60992200002693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.8418386060145, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.64234790472506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.76484605535293, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.30521584813457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 05:00:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3638.4393895000358, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13637.514877500507, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.43216443332494, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 199.83664511266275, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.1189015000673, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.96504800027833, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.13681476256774, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.6839072689676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.569469998782356, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.64681703876947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59962.44221149993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6361.296463499968, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37740.73947849, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.71057482666917, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37190.24467849999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.16868700000123, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.62116391947876, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.19780760871951, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.52574322834293, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.92559944563227, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:35:32 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6363.175155999954, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59489.59240800002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.46530145999759, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37368.98345445866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.22012699998459, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37067.69261349996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.18218308004019, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.26940382136384, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.90844879031058, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.56675985177498, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:14:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1829.1798820000622, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62056.8363225002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.8089847133136, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26855.629502783344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.889288499656686, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25377.36457000028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.61356858195317, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.2058512415153, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.88905354923614, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 196.33672283094916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:27:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 254069.14275899975, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2027.6393539998026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 240695.88777094072, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.96425516666082, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 240912.15783250003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.60456700017312, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.65105717837609, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.566972347454207, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.43780510097852, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.650799858896228, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:14:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6011.775941000053, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5088.906428999508, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.2050380700025, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.13272269999531, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.94614950008872, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.99242449990561, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.3468362321537, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.22368878213732, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.33262726330132, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.838412958535734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:20:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2410.191408999708, + "value": 2443.2510500000717, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.48227018665541, + "value": 115.30383123866584, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.24950499995975, + "value": 80.4688204998456, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.93705259664011, + "value": 18.031387206200954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.160367230376785, + "value": 16.145742038947475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63109.12443750021, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 254335.7819769999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 27099.457985118017, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 239884.11322115734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24794.5113629994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241095.04781499982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 186.17630149679047, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.82444884384628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.59613802394037, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.901508156746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:36:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2033.1724739999117, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6105.448188500077, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.60828334669229, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.28023722000701, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.80177150015152, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.17037599987452, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.668408356669811, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.56989417257731, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.696979096307233, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.099051351825665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:47:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5098.723483000867, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3638.254859999961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.27253446010339, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.63412806328537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.85428400001547, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.39277999970363, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.22974063171822, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.245417164670172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.82210335211133, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.581465649564212, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:44:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1938.1165569998302, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15878.910925000127, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.63462712665812, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 257.79683584202337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.184574000249995, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.93650999987585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.062266717402675, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.3690193086413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.160580846799421, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 134.16699667579783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13524.57672249966, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5263.1742175008185, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 198.33103487866478, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.06313942133661, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.28295849955975, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.37223499958054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.35815688233247, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.361488833984794, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.10263256790182, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.77556198029057, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:40:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5304.019143500227, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2584.6181110000543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.5558509067002, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.38421627068116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.9265964997685, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.61264300037874, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.72565584426561, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.207850582925126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.897744546202475, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.259956566071594, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1921.0260259997085, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1824.3145765000008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.76962023666056, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.51873117000287, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.09594550009933, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.77562600021702, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.251560110156246, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.61187830148239, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.549780864286399, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.88771671636012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:54:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76498.90343950005, + "value": 76891.4737555001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56876.460692046676, + "value": 57436.928021955995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69992.24383849991, + "value": 70402.43751549997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.16335240767877, + "value": 66.47794981294955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.58964462050396, + "value": 65.12885841910457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 03:00:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7698.409701499941, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1922.217876999639, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.52133288799632, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.57471203667167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.80558999997993, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.548126000161574, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.29115957646765, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.195607250386715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.01315399741855, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.492856398193746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:27:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14792.707703500128, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1940.8474389997536, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 218.14901325201268, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.07058909334714, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.01783900009104, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.778589500452654, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.7628959969548, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.09075655858916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.36463364552219, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.185356074012214, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:06:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2575.180585000453, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5820.976581500872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.64534286131433, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.63150633461433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.11354049985675, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.11294800034375, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.257292271003042, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.02006540933792, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.321459875944225, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.0881008061641, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 04:00:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5891.935765999733, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7682.707939000011, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.47207699864036, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.30934739999762, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.99214000030042, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.49294300001657, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.53983529779377, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.15492825600079, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.422655616102965, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.823160924864695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-17 04:53:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6054.575040000145, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6018.421747500042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.50609336336493, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.63125606333186, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.48204099989198, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.89627549998113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.69021046032865, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.390901221479396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.28364630012998, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.30476443296476, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-17 02:51:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -73766,668 +73766,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-17T18:21:50Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/d69a34a3194efb8dd34c1f293af91aac19b5b992" }, - "date": 1716017339879, + "date": 1716104023513, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14810.016069499852, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61655.41962700081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 218.56008913000855, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26528.746806866013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.22120100016036, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24690.114835500026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.2988079861909, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 184.86608919589457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.42463874072129, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 196.17474513994708, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:07:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6047.7593895000155, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6044.677653000008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.70249681000102, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.03815819000427, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.60992200002693, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.75302800009376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.64234790472506, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.424346910297224, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.30521584813457, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.21783580931318, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:52:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13637.514877500507, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1921.6672434999964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 199.83664511266275, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.78780205332501, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.96504800027833, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.91459850015235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.6839072689676, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.26898181958198, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.64681703876947, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.542263680108311, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:40:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6361.296463499968, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6127.92181750001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.71057482666917, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.16290845333363, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.16868700000123, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.89771300009306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.19780760871951, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.56511063377084, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.92559944563227, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.08658959372397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:14:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59489.59240800002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1824.861421499918, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37368.98345445866, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.41041380330292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37067.69261349996, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.37233400052355, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.26940382136384, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.626507907901109, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.56675985177498, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.886236209687796, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:35:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62056.8363225002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3635.1900660001775, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26855.629502783344, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.38315672996941, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25377.36457000028, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.76733099993362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.2058512415153, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.280647590190572, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 196.33672283094916, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.582746199784975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:36:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2027.6393539998026, + "value": 2033.8978135005163, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.96425516666082, + "value": 80.8639452733102, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.60456700017312, + "value": 39.51642699985314, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.566972347454207, + "value": 11.673895722857624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.650799858896228, + "value": 11.69115451814601, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:47:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5088.906428999508, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5938.327654000204, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.13272269999531, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.85603009857974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.99242449990561, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.8111569995308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.22368878213732, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.32193032637963, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.838412958535734, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.33472051666922, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:45:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2443.2510500000717, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5083.483950499613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.30383123866584, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.46125785995778, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.4688204998456, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.44288049967872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.031387206200954, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.223061952446045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.145742038947475, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.817517198938685, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:34:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 254335.7819769999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6368.015514000035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 239884.11322115734, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.74491108000059, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241095.04781499982, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.00241100001858, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.82444884384628, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.20411600305019, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.901508156746, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.93004425265935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:15:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6105.448188500077, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14781.194467000205, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.28023722000701, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 219.56516076601233, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.17037599987452, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.35813100028463, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.56989417257731, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.4530258622076, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.099051351825665, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.81759093661918, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:44:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3638.254859999961, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16121.083815500242, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.63412806328537, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 275.79642830399223, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.39277999970363, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.28274150044308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.245417164670172, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.05415411786547, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.581465649564212, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 136.22412523832307, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:16:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15878.910925000127, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2584.5175194999683, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 257.79683584202337, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.96530007466815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.93650999987585, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.06174400038435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.3690193086413, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.208495283912516, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 134.16699667579783, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.280756653462735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 05:00:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5263.1742175008185, + "value": 5309.328878999622, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.06313942133661, + "value": 182.55137791065258, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.37223499958054, + "value": 174.64112149991706, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.361488833984794, + "value": 39.58673908159116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.77556198029057, + "value": 36.90731126948433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:26:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2584.6181110000543, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7728.100629999972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.38421627068116, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.62900196932907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.61264300037874, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.0928509999485, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.207850582925126, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.93319605515613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.259956566071594, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.58694876708374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 04:00:28 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1824.3145765000008, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76756.42494399994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.51873117000287, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57101.352517459985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.77562600021702, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69993.83625249992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.61187830148239, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.00693326917761, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.88771671636012, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:28:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.64969922823357, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76891.4737555001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13629.244735499924, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57436.928021955995, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 205.90700912799548, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70402.43751549997, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.40073799972743, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.47794981294955, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.67080802803109, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.12885841910457, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.6921246030719, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:00:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1922.217876999639, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 255002.6453329999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.57471203667167, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 240243.91990748668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.548126000161574, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241767.39791549972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.195607250386715, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.7344433797077, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.492856398193746, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.95666295226302, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:54:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1940.8474389997536, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2445.613765499729, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.07058909334714, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.16249440534011, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.778589500452654, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.27358949989139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.09075655858916, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.96014548999124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.185356074012214, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.162734591422716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 03:22:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5820.976581500872, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1941.047039500063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.63150633461433, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.00893331999518, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.11294800034375, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.485431000230164, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.02006540933792, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.129155345630748, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.0881008061641, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.12759747485139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-18 04:53:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7682.707939000011, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6071.857499000089, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.30934739999762, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.62412981668572, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.49294300001657, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.97832849989572, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.15492825600079, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.80621436769108, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.823160924864695, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.397060414084116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:27:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6018.421747500042, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59540.021137000054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.63125606333186, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37168.83478003866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.89627549998113, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36646.56828200009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.390901221479396, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.34496673627908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.30476443296476, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.2032687169866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-18 02:21:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -74443,673 +74443,673 @@ window.BENCHMARK_DATA = { "username": "web-flow", "email": "noreply@github.com" }, - "id": "d69a34a3194efb8dd34c1f293af91aac19b5b992", - "message": "updates for nm-magic-wand, nightly or release (#247)\n\nSUMMARY:\r\n* update GHA action `nm-build-vllm` to not install\r\n`nm-magic-wand-nightly`\r\n* update build script to not install `nm-magic-wand-nightly` (we might\r\nconsider getting rid of this script altogether, since we aren't really\r\nusing it)\r\n* remove unused GHA action `nm-test-vllm`. this has been superseded by\r\n`nm-install-test-whl`\r\n* update GHA action `nm-install-test-whl` to get version of\r\n`nm-magic-wand` if `nm-magic-wand-nightly` is not present\r\n* update `setup.py` to default generate \"nightly\" package and add option\r\nbased on ENV to generate release package. this also includes managing\r\nthe dependency on `nm-magic-wand`.\r\n* update `set-env` action to set ENV based on `wf_category` input\r\n* update \"release\" workflow to include all supported python versions\r\n* delete obsolete \"gen-whl\"\r\n\r\nNOTES:\r\n- \"magic-wand\" is only a runtime dependency, so no need to install it\r\nduring build phase.\r\n- this PR makes it so that we by default generate a \"nightly\" package\r\nwith a \"nightly\" version number. if we want to generate a release\r\npackage we'll need to specify `wf_category` as `RELEASE`.\r\n\r\nTEST PLAN:\r\nruns on remote push. verifying that `wf_category` set to `RELEASE` will\r\ngenerate appropriate package.\r\n\r\nran \"build\" workflow with `wf_category` set to `RELEASE` ... package\r\nlooks properly named and versioned ...\r\nhttps://github.com/neuralmagic/nm-vllm/actions/runs/9129675592\r\n\r\nthe \"remote push\" defaulted to generating a \"nightly\" package ... please\r\nsee ... https://github.com/neuralmagic/nm-vllm/actions/runs/9129665988\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma ", - "timestamp": "2024-05-17T18:21:50Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/d69a34a3194efb8dd34c1f293af91aac19b5b992" + "id": "93183d6b21fd1f42fc2a98b93e46fca0c5530b40", + "message": "increase timeouts (#253)\n\nSUMMARY:\r\n* increase \"RELEASE\" workflow timeouts to 12 hours\r\n\r\nTEST PLAN:\r\nwill cherry pick and trigger job manually on release, `v0.3.0`\r\n\r\nCo-authored-by: andy-neuma ", + "timestamp": "2024-05-20T21:44:29Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/93183d6b21fd1f42fc2a98b93e46fca0c5530b40" }, - "date": 1716104023513, + "date": 1716276873695, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61655.41962700081, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5888.232021000476, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26528.746806866013, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.28707645070001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24690.114835500026, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.33425299933151, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 184.86608919589457, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.37600865038578, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 196.17474513994708, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.240458468117346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:42:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6044.677653000008, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1851.7477995001173, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.03815819000427, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.63186055335609, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.75302800009376, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.01871750029386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.424346910297224, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.64062083609474, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.21783580931318, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.908801976978634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:26:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1921.6672434999964, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62561.73622250026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.78780205332501, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26827.57049859066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.91459850015235, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24709.98807849992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.26898181958198, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 186.2852890606116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.542263680108311, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.76614333508624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:59:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6127.92181750001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76591.36254999999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.16290845333363, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56769.70654236267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.89771300009306, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69590.44582899992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.56511063377084, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.06704888151754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.08658959372397, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.64701087512805, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:49:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1824.861421499918, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1948.544970999592, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.41041380330292, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.7087402533507, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.37233400052355, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.53587200021502, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.626507907901109, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.09703730208491, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.886236209687796, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.17204182207614, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:33:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3635.1900660001775, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15923.402555499706, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.38315672996941, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245.11491717732738, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.76733099993362, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.51965949989244, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.280647590190572, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.06644579933197, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.582746199784975, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.87246110308337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:21:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2033.8978135005163, + "value": 2035.2116740000383, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.8639452733102, + "value": 81.47769093337047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.51642699985314, + "value": 40.839236000010715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.673895722857624, + "value": 11.604968096061906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.69115451814601, + "value": 11.710429787194647, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:52:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5938.327654000204, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6043.739634999952, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.85603009857974, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.0259374199946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.8111569995308, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.7043535000912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.32193032637963, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.47326180115146, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.33472051666922, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.24005488804747, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:58:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5083.483950499613, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6097.635424999908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.46125785995778, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.50908731998727, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.44288049967872, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.17343300005996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.223061952446045, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.57012586129777, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.817517198938685, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.0636363200751, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:50:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6368.015514000035, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59794.577934000015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.74491108000059, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37813.855549205335, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.00241100001858, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37292.644641000035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.20411600305019, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.51843281166308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.93004425265935, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.77894709484394, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:19:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14781.194467000205, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5321.007210000062, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 219.56516076601233, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.49545777331514, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.35813100028463, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.38897650006402, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.4530258622076, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.54371866717419, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 124.81759093661918, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:12:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.85150928037011, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16121.083815500242, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5097.145164000722, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 275.79642830399223, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.19441409999611, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.28274150044308, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.55316599961225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.05415411786547, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.27375442270062, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 136.22412523832307, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.899791572255083, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 05:05:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2584.5175194999683, + "value": 2582.9528250001204, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.96530007466815, + "value": 121.40620600400871, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.06174400038435, + "value": 84.30327799987936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.208495283912516, + "value": 19.327078396834594, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.280756653462735, + "value": 17.348156597758088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 04:05:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5309.328878999622, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7676.953527500018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.55137791065258, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.45015838399922, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.64112149991706, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.44363349988907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.58673908159116, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.14807488799022, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.90731126948433, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.86473735310059, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-19 04:31:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7728.100629999972, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6375.89961599997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.62900196932907, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.17996672000194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.0928509999485, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.82110750002357, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.93319605515613, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.261575918773644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.58694876708374, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.93941055041797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:32:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76756.42494399994, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 256178.08394700024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57101.352517459985, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241217.3788715066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69993.83625249992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242909.12422749988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.00693326917761, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.78426943271543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.64969922823357, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.42910055791909, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:05:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13629.244735499924, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14962.165949000337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 205.90700912799548, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 217.35480540932926, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.40073799972743, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.4372419998399, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.67080802803109, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.31899528070943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.6921246030719, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.0186063754553, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:45:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 255002.6453329999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2412.8427380001085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 240243.91990748668, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.05806883201392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241767.39791549972, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.62716600008935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.7344433797077, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.967667828441932, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.95666295226302, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.19396913740613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:20:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2445.613765499729, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3648.4311744998195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.16249440534011, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.61817750334254, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.27358949989139, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.64381549989776, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.96014548999124, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.391557942030953, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.162734591422716, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.682450065742795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:39:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1941.047039500063, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1931.3982129997385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.00893331999518, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.64826490332962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.485431000230164, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.139810999520705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.129155345630748, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.212292750231466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.12759747485139, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.533745229745174, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 03:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6071.857499000089, + "value": 6043.03864350004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.62412981668572, + "value": 121.09042400334602, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.97832849989572, + "value": 81.08035050008766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.80621436769108, + "value": 39.646203415611396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.397060414084116, + "value": 39.31455497377305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:57:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59540.021137000054, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13850.136857500274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37168.83478003866, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.4305926473353, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36646.56828200009, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.73287000017444, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.34496673627908, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 108.9987464124662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.2032687169866, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 113.73618585393356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-19 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -75130,668 +75130,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-20T21:44:29Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/93183d6b21fd1f42fc2a98b93e46fca0c5530b40" }, - "date": 1716276873695, + "date": 1716363120528, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5888.232021000476, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 256771.280315, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.28707645070001, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241967.81838042397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.33425299933151, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243518.75813349965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.37600865038578, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.92201654990878, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.240458468117346, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.57899807596435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1851.7477995001173, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15245.16342600009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.63186055335609, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 216.33090558799086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.01871750029386, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.93354750022627, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.64062083609474, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.57023909569018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.908801976978634, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.89090979804146, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62561.73622250026, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76459.87004700009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26827.57049859066, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57006.08940392401, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24709.98807849992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69956.17225149977, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 186.2852890606116, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.29421221482295, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.76614333508624, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.44521783417453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:42:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76591.36254999999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16063.256238000577, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56769.70654236267, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 285.1757610933564, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69590.44582899992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 163.56607699981396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.06704888151754, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.20962904427824, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.64701087512805, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 135.77424767327517, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:02:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1948.544970999592, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2590.1457790000677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.7087402533507, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.22204088800088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.53587200021502, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.77577700023176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.09703730208491, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.27610465963713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.17204182207614, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.255476069253277, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:28:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15923.402555499706, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3649.4989170000736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245.11491717732738, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.08588893001132, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.51965949989244, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.60186949970739, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.06644579933197, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.37970848197325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.87246110308337, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.666738033050542, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 05:06:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2035.2116740000383, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5305.091374999392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.47769093337047, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.49676534935378, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.839236000010715, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.33045750001475, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.604968096061906, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.56192385121597, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.710429787194647, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.86431237682397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:54:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6043.739634999952, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1938.3338355000888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.0259374199946, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.67521980001038, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.7043535000912, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.12038950031638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.47326180115146, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.079684189198728, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.24005488804747, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.16000790993628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:22:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6097.635424999908, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6054.55432500014, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.50908731998727, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.47732568000956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.17343300005996, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.02754549999281, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.57012586129777, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.41897263375693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.0636363200751, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.26074101871709, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:46:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59794.577934000015, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1921.9436020002831, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37813.855549205335, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.61242698001236, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37292.644641000035, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.55323299981319, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.51843281166308, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.216317727657616, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.77894709484394, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.480939822172344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:36:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5321.007210000062, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1826.2239115001648, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.49545777331514, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.34109191331663, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.38897650006402, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.82000150025124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.54371866717419, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.62939598497101, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.85150928037011, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.92246008794239, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:32:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5097.145164000722, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2410.6342294999195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.19441409999611, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.91471860398451, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.55316599961225, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.25686200012206, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.27375442270062, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.08641513766197, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.899791572255083, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.237355544863274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:50:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2582.9528250001204, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6061.561180500121, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.40620600400871, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.74932165665511, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.30327799987936, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 75.64902700005405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.327078396834594, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.64483831073297, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.348156597758088, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.25501033452916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:06:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7676.953527500018, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6371.696383000028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.45015838399922, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.48925968000124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.44363349988907, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.49467699996603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.14807488799022, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.201560438764346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.86473735310059, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.90810849399123, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:28:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6375.89961599997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5821.214592499928, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.17996672000194, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.52209673600494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.82110750002357, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.16225599953032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.261575918773644, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.09503225674646, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.93941055041797, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.1659372870581, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:14:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 256178.08394700024, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62754.508411499046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241217.3788715066, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 27318.900191153323, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242909.12422749988, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25762.201995500618, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.78426943271543, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.7308705063005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.42910055791909, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 196.39547295298112, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:21:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14962.165949000337, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2027.2287369994046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 217.35480540932926, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.79010776007392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.4372419998399, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.287139500378544, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.31899528070943, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.60009808279639, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.0186063754553, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.687424613845359, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:12:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2412.8427380001085, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5068.800384000497, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.05806883201392, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.11502234658353, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.62716600008935, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.95106050038157, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.967667828441932, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.22859554011948, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.19396913740613, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.890091700003964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:40:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3648.4311744998195, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13774.644650499795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.61817750334254, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 206.66789126134367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.64381549989776, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.7057569999597, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.391557942030953, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.51795528662116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.682450065742795, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 113.96346420574625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-21 04:22:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1931.3982129997385, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6115.162760999965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.64826490332962, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.56159974003337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.139810999520705, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.11591900013991, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.212292750231466, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.617765827755456, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.533745229745174, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.101717808082036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 04:00:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6043.03864350004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7772.880707999889, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.09042400334602, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.48812977466272, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.08035050008766, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.6518690000048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.646203415611396, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.264786195731034, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.31455497377305, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.87179617812733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 02:53:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13850.136857500274, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60296.60475000003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.4305926473353, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38417.961550126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.73287000017444, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38246.879616499995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 108.9987464124662, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.11643894172259, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 113.73618585393356, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.4101346244016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-21 03:46:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -75807,1355 +75807,1355 @@ window.BENCHMARK_DATA = { "username": "web-flow", "email": "noreply@github.com" }, - "id": "93183d6b21fd1f42fc2a98b93e46fca0c5530b40", - "message": "increase timeouts (#253)\n\nSUMMARY:\r\n* increase \"RELEASE\" workflow timeouts to 12 hours\r\n\r\nTEST PLAN:\r\nwill cherry pick and trigger job manually on release, `v0.3.0`\r\n\r\nCo-authored-by: andy-neuma ", - "timestamp": "2024-05-20T21:44:29Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/93183d6b21fd1f42fc2a98b93e46fca0c5530b40" + "id": "a10b8316b838ebf315aafc4e0429e1399eca3307", + "message": "`requirements-dev.txt` and workflow patches (#255)\n\nSUMMARY:\r\n* update `requirements-dev.txt` to address `urllib` dependency\r\n* update \"build test\" only be reusable\r\n* update \"benchmark\" workflow to disambiguate artifacts\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: Michael Goin \r\nCo-authored-by: dhuangnm <74931910+dhuangnm@users.noreply.github.com>\r\nCo-authored-by: andy-neuma \r\nCo-authored-by: Domenic Barbuzzi ", + "timestamp": "2024-05-22T13:44:43Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/a10b8316b838ebf315aafc4e0429e1399eca3307" }, - "date": 1716363120528, + "date": 1716449679848, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 256771.280315, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7689.986569999974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241967.81838042397, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.2369954986707, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243518.75813349965, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.68052250009987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.92201654990878, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.18177578822051, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.57899807596435, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.90111482577333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:18:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15245.16342600009, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6355.970685499983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 216.33090558799086, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.16883072000064, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.93354750022627, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.46714699993117, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.57023909569018, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.17667501279266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.89090979804146, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.897573278567734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:09:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76459.87004700009, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13543.124993999754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57006.08940392401, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 198.0917725726455, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69956.17225149977, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.69420799961154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.29421221482295, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.5214924319586, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.44521783417453, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.27711537817672, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:03:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16063.256238000577, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14884.52258549978, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 285.1757610933564, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 215.83828740533318, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 163.56607699981396, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.05572449988904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.20962904427824, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.93837186669334, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 135.77424767327517, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.709955622744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 05:03:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2590.1457790000677, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59568.68839799995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.22204088800088, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37536.996530118005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.77577700023176, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36862.34628149998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.27610465963713, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.34058150278607, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.255476069253277, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.78903259935451, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 04:02:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3649.4989170000736, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76216.40349400013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.08588893001132, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56605.960194843996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.60186949970739, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69334.1744034999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.37970848197325, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.27457006131601, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.666738033050542, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.15658977573332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:19:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5305.091374999392, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2413.9468949997536, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.49676534935378, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.32430701464425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.33045750001475, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.14013299993894, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.56192385121597, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.99891797558831, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.86431237682397, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.15897576666761, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:29:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1938.3338355000888, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5087.44845199999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.67521980001038, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.80579291998099, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.12038950031638, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.061083001230145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.079684189198728, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.240965857212494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.16000790993628, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.85187867530749, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:25:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6054.55432500014, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1943.3179324996672, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.47732568000956, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.02932675337676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.02754549999281, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.14830800022173, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.41897263375693, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.069767070678836, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.26074101871709, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.176234212828936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:23:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1921.9436020002831, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6100.955769499933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.61242698001236, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.57615327331648, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.55323299981319, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.58089699979064, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.216317727657616, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.58406245663734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.480939822172344, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.07420007437409, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:56:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1826.2239115001648, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5270.538775500427, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.34109191331663, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.91739862662266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.82000150025124, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.0050875003144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.62939598497101, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.47199793280612, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.92246008794239, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.66351316804006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:30:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2410.6342294999195, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3635.538525999891, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.91471860398451, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.14882953669135, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.25686200012206, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.81988950008235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.08641513766197, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.17861995539896, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.237355544863274, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.656972808143404, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:37:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6061.561180500121, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5823.356046999834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.74932165665511, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.88542066930434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 75.64902700005405, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.16509849950671, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.64483831073297, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.13534165651794, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.25501033452916, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.2948623041938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6371.696383000028, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2552.110619999439, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.48925968000124, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.35040164531893, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.49467699996603, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.91137899980095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.201560438764346, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.003036375024585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.90810849399123, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.2926273737124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:17:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5821.214592499928, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16172.425026000383, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.52209673600494, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 274.12945982464953, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.16225599953032, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.27451699937956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.09503225674646, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.62205870226447, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.1659372870581, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 135.15480796223878, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:56:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62754.508411499046, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1921.1439794999023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 27318.900191153323, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.49463854331407, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25762.201995500618, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.16418050037464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.7308705063005, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.252081158818903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 196.39547295298112, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.528253547893332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:39:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2027.2287369994046, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 254997.02183650038, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.79010776007392, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 240218.253697654, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.287139500378544, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241757.30261750004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.60009808279639, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.71335484664382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.687424613845359, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.96072925992397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:50:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5068.800384000497, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6035.140677500067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.11502234658353, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.43279252667244, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.95106050038157, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.55342100000962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.22859554011948, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.28536140570839, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.890091700003964, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.236489911805975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-22 04:47:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13774.644650499795, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6032.537427500074, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 206.66789126134367, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.9273855433418, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.7057569999597, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.7403040000263, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.51795528662116, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.52752941719753, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 113.96346420574625, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.282266362386494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 03:43:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6115.162760999965, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1822.5031120000494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.56159974003337, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.59179489331775, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.11591900013991, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.79737500006377, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.617765827755456, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.556154617036514, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.101717808082036, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.89281882411575, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:47:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7772.880707999889, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64642.84791799946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.48812977466272, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 28202.068138578023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.6518690000048, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26515.03411000067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.264786195731034, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 187.5715645663519, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.87179617812733, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 198.60651942622803, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:30:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60296.60475000003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2030.1320085000043, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38417.961550126, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.92254640001556, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38246.879616499995, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.536833000056504, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.11643894172259, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.594711202005476, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.4101346244016, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.670738943688308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-22 02:38:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Andy Linfoot", - "username": "andy-neuma", - "email": "78757007+andy-neuma@users.noreply.github.com" + "name": "dhuangnm", + "username": "dhuangnm", + "email": "74931910+dhuangnm@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "a10b8316b838ebf315aafc4e0429e1399eca3307", - "message": "`requirements-dev.txt` and workflow patches (#255)\n\nSUMMARY:\r\n* update `requirements-dev.txt` to address `urllib` dependency\r\n* update \"build test\" only be reusable\r\n* update \"benchmark\" workflow to disambiguate artifacts\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: Michael Goin \r\nCo-authored-by: dhuangnm <74931910+dhuangnm@users.noreply.github.com>\r\nCo-authored-by: andy-neuma \r\nCo-authored-by: Domenic Barbuzzi ", - "timestamp": "2024-05-22T13:44:43Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/a10b8316b838ebf315aafc4e0429e1399eca3307" + "id": "a6b94433cc411da4e03724d54c5b158a24cfc6b3", + "message": "update install commands (#264)\n\nUse nm pypi to install.\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm ", + "timestamp": "2024-05-23T22:20:55Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716449679848, + "date": 1716536565926, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7689.986569999974, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53340.341240000045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.2369954986707, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32647.24735656332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.68052250009987, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32484.890060499994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.18177578822051, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.19022086013526, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.90111482577333, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.75279759147365, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:32:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6355.970685499983, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56350.74879249987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.16883072000064, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23603.44053233068, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.46714699993117, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 21319.333429500148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.17667501279266, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.05547488093498, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.897573278567734, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.6181682981477, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:19:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13543.124993999754, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6328.47620550001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 198.0917725726455, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.63078116000695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.69420799961154, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.90298399999983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.5214924319586, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.021877670575236, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.27711537817672, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.7119793768181, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:45:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14884.52258549978, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1927.5843179998446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 215.83828740533318, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.19803643329699, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.05572449988904, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.83859500001563, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.93837186669334, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 10.948486448532625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.709955622744, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.07130429856349, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:12:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59568.68839799995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1922.216698000284, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37536.996530118005, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.95481403333845, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36862.34628149998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.12456300004487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.34058150278607, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.021855861203537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.78903259935451, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.31126985500981, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:40:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76216.40349400013, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 10706.843501000094, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56605.960194843996, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 189.20717229067245, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69334.1744034999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.12494499988316, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.27457006131601, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.11347146428858, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.15658977573332, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.72714163173411, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:05:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2413.9468949997536, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6021.900845499886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.32430701464425, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.95130278667511, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.14013299993894, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.55972399979328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.99891797558831, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.442295126045764, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.15897576666761, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.16522131586314, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:39:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5087.44845199999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2501.4737395003976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.80579291998099, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.45848303999931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.061083001230145, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.39095849987643, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.240965857212494, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.586016224269795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.85187867530749, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.826070068509377, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:50:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1943.3179324996672, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2018.6891855000795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.02932675337676, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.39448462671376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.14830800022173, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.17358449987296, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.069767070678836, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.47898532661828, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.176234212828936, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.611666316849625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:27:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6100.955769499933, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1795.7626519996666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.57615327331648, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.60398087663452, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.58089699979064, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.510263999778545, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.58406245663734, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.359187912905757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.07420007437409, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.727949840223024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:49:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5270.538775500427, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5015.686206999817, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.91739862662266, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.56763556000563, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.0050875003144, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.70900599954621, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.47199793280612, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.956494874308596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.66351316804006, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.625132983095348, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:31:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3635.538525999891, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5964.362075999929, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.14882953669135, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.80836461333107, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.81988950008235, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.86407200000212, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.17861995539896, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.03774788189492, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.656972808143404, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.01274685221, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:21:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5823.356046999834, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 248859.66981499997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.88542066930434, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 234459.43021540332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.16509849950671, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 235739.0611275, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.13534165651794, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.13166612947958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.2948623041938, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.17691391666763, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:58:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2552.110619999439, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11799.690713000473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.35040164531893, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 200.45587572466198, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.91137899980095, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.61897100021815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.003036375024585, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.01456649864781, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.2926273737124, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.56831145706713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 04:05:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16172.425026000383, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2360.9765269998206, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 274.12945982464953, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.06260132800647, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.27451699937956, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.12535400034903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.62205870226447, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.382147388704105, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 135.15480796223878, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15.60249533650649, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 05:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1921.1439794999023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3616.067349000332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.49463854331407, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.33096500331158, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.16418050037464, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.21500950015616, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.252081158818903, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.055657327705525, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.528253547893332, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.363944819167397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:59:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 254997.02183650038, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5632.193468999503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 240218.253697654, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.86905475065336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241757.30261750004, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.12576200084732, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.71335484664382, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 51.53382027128091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.96072925992397, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 44.77461915881346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:20:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6035.140677500067, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6073.618323000119, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.43279252667244, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.26460736663405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.55342100000962, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.37401249984032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.28536140570839, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.37037158497149, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.236489911805975, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.94458470579859, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:25:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6032.537427500074, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11876.43002400091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.9273855433418, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.14412551399556, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.7403040000263, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.1723874999443, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.52752941719753, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.47279808382501, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.282266362386494, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.25909209936462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 02:56:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1822.5031120000494, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73406.11455199996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.59179489331775, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54311.549439408, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.79737500006377, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66464.88848750005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.556154617036514, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.72266576174421, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.89281882411575, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.966167132927936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:33:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64642.84791799946, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7404.249196000023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 28202.068138578023, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.10822315466854, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26515.03411000067, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.76332000009916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 187.5715645663519, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.44544366717936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 198.60651942622803, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.98981760273222, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-23 04:42:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2030.1320085000043, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5116.336590000174, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.92254640001556, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.1112781680058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.536833000056504, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.22934650019306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.594711202005476, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.228384017145665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.670738943688308, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.34638776367882, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-23 03:52:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -77176,668 +77176,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716536565926, + "date": 1716536817806, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53340.341240000045, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6036.9134255000745, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 32647.24735656332, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.22891422666697, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 32484.890060499994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.64084949991502, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.19022086013526, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.29985978654724, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.75279759147365, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.22647533997652, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56350.74879249987, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1922.9179699996166, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23603.44053233068, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.29596407001979, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 21319.333429500148, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.72993099996893, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.05547488093498, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.254491473531806, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.6181682981477, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.581384785427327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:45:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6328.47620550001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5865.428363000319, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.63078116000695, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.17474247867843, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.90298399999983, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.99209249900741, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.021877670575236, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.13994388472499, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.7119793768181, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.25050560318028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:18:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1927.5843179998446, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13788.609590000306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.19803643329699, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 200.7562045140021, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.83859500001563, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.53075749989875, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 10.948486448532625, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 108.21229497936153, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.07130429856349, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 113.49323806909048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1922.216698000284, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 257930.24714800002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.95481403333845, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242762.75297554734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.12456300004487, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244606.07823699957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.021855861203537, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.98077336152181, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.31126985500981, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.57785580277736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:03:31 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 10706.843501000094, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6121.938016500053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 189.20717229067245, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.77741779333034, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.12494499988316, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.98791049982356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.11347146428858, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.63266728413954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.72714163173411, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.16002097251475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:50:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6021.900845499886, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1841.6702359995725, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.95130278667511, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.74554867333487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.55972399979328, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.50209150023511, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.442295126045764, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.661615063284032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.16522131586314, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.941003594092496, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:01:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2501.4737395003976, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2419.35477349989, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.45848303999931, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.5059897760002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.39095849987643, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.34635549959057, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.586016224269795, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.00136931223196, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.826070068509377, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.246505667705023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:09:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2018.6891855000795, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6034.086707999904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.39448462671376, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.38761330666662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.17358449987296, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.75800400024491, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.47898532661828, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.61820284429493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.611666316849625, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.31828487961084, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:57:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1795.7626519996666, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5090.033450500414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.60398087663452, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.7804306066076, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.510263999778545, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.67713300046307, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.359187912905757, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.23894181575603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.727949840223024, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.854331450733856, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:37:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5015.686206999817, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76102.08756499992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.56763556000563, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56401.26771139466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.70900599954621, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69115.46357449993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.956494874308596, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.94664253804291, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.625132983095348, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.49061364211835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5964.362075999929, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2039.5816205004849, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 124.80836461333107, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.73425066667066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.86407200000212, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.44177899989154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.03774788189492, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.61755039425222, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.01274685221, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.687218283229393, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:30:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 248859.66981499997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60704.40484450001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 234459.43021540332, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38640.03059703801, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 235739.0611275, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38752.15475250002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.13166612947958, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.00013161536351, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.17691391666763, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.45309630879403, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:24:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11799.690713000473, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15843.160680000437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 200.45587572466198, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 238.37949193667492, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.61897100021815, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 152.5966659992264, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.01456649864781, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.5245050964936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.56831145706713, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.85664370317943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2360.9765269998206, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3650.308398499874, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.06260132800647, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.73153383998155, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.12535400034903, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.141590000414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.382147388704105, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.28558014734185, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15.60249533650649, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.59825115081777, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:43:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3616.067349000332, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61973.708557499776, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.33096500331158, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26749.070237649325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.21500950015616, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25255.849778500306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.055657327705525, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 184.85499686074058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.363944819167397, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 196.2129969969764, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:25:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5632.193468999503, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1943.4454174997882, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.86905475065336, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.29053529998782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.12576200084732, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.866527499976655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 51.53382027128091, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.08816577639735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 44.77461915881346, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.178469397792009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6073.618323000119, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6370.195135500012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.26460736663405, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 108.33617884666637, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.37401249984032, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.33415199995352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.37037158497149, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.22914209162545, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.94458470579859, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.92209173912628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:54:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11876.43002400091, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5320.986331498716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.14412551399556, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.93842036265414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.1723874999443, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.00369149961625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.47279808382501, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.66642364158503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.25909209936462, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.79658968764954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:09:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73406.11455199996, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7782.100346499988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54311.549439408, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.5081379680014, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66464.88848750005, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.07041599996683, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.72266576174421, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.203279359686796, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.966167132927936, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.921139528492176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:10:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7404.249196000023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2569.8311404999004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.10822315466854, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.11115059466101, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.76332000009916, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.52654550001171, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.44544366717936, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.055629099922392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.98981760273222, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.188879242395696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:37:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5116.336590000174, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14923.420833999444, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.1112781680058, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 220.1605600566727, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.22934650019306, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 175.95376700000998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.228384017145665, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.74639971695998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.34638776367882, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.56766392840541, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:35:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -77858,668 +77858,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716536817806, + "date": 1716536967175, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6036.9134255000745, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3674.2376610000065, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.22891422666697, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.098352476661, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.64084949991502, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.25462249994598, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.29985978654724, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.580405419507663, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.22647533997652, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.82968114245691, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:34:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1922.9179699996166, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6158.242706500005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.29596407001979, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.9525071600201, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.72993099996893, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.32726299994829, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.254491473531806, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.84905220986917, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.581384785427327, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.341917398678106, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:08:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5865.428363000319, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66139.41254149994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.17474247867843, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 43122.886156283326, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.99209249900741, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 43070.87479899997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.13994388472499, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.30706422659057, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.25050560318028, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.03386879880344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:07:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13788.609590000306, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18679.325246499957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 200.7562045140021, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 231.30088457267934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.53075749989875, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.6688005000251, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 108.21229497936153, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 153.79304180612957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 113.49323806909048, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.74625207767156, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 257930.24714800002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1851.8035160000181, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242762.75297554734, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.9118324999951, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244606.07823699957, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.61253549998946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.98077336152181, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.751144219890206, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.57785580277736, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.088412348256844, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:29:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6121.938016500053, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2647.3707849995662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.77741779333034, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.92175404267255, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.98791049982356, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.58569550020911, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.63266728413954, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.90610722549118, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.16002097251475, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.8848995473422, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:58:38 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1841.6702359995725, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1952.816832500048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.74554867333487, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.18456532334434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.50209150023511, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.47611900016636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.661615063284032, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.603912201429557, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.941003594092496, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.803728676648895, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:42:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2419.35477349989, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19390.905797000414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.5059897760002, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1893.6356199340166, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.34635549959057, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 314.27889649967256, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.00136931223196, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.7659810246803, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.246505667705023, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.45676941805075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:48:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6034.086707999904, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5496.9637855001565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.38761330666662, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.92184810803397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.75800400024491, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.60201850078738, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.61820284429493, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.53539019691746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.31828487961084, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.7112078901483, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:05:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5090.033450500414, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5146.833499498825, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.7804306066076, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.67963537338558, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.67713300046307, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.43261250085197, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.23894181575603, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.51844826831605, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.854331450733856, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.104311450581175, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:59:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76102.08756499992, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6061.453177999965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56401.26771139466, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.60437605000484, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69115.46357449993, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.86141350008802, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.94664253804291, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.935846385705155, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.49061364211835, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.556099753483785, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:14:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2039.5816205004849, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1959.2217435001658, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.73425066667066, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.03675333999126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.44177899989154, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.935721999881935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.61755039425222, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.231196407969382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.687218283229393, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.280820497209362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:01:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60704.40484450001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 258816.12126400022, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38640.03059703801, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244034.62704731268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38752.15475250002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245447.883156, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.00013161536351, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.6744499885317, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.45309630879403, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.75007709108667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:49:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15843.160680000437, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6111.26218849995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 238.37949193667492, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.27864344000031, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 152.5966659992264, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.03411599992705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.5245050964936, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.89042627643755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.85664370317943, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.66110326928299, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:14:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3650.308398499874, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70477.60344899961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.73153383998155, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31452.343251931336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.141590000414, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30347.126394500265, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.28558014734185, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.51186562524066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.59825115081777, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 204.3971678225958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:30:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61973.708557499776, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16623.214362499766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26749.070237649325, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 219.05289470665957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25255.849778500306, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.0892044998218, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 184.85499686074058, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 136.97556747857035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 196.2129969969764, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.10401456193338, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:50:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1943.4454174997882, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2505.5321414997707, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.29053529998782, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.93201387466252, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.866527499976655, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.59506499987401, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.08816577639735, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.742624648202092, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.178469397792009, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.925428114448234, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:36:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6370.195135500012, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6096.751922499607, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 108.33617884666637, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.48616470934212, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.33415199995352, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.64066650030145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.22914209162545, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.061820024085044, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.92209173912628, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.02706380161126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:28:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5320.986331498716, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2048.1824245002827, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.93842036265414, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.24031411338729, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.00369149961625, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42.830491999666265, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.66642364158503, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.696145043637065, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.79658968764954, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.771937668840932, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7782.100346499988, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79057.19572249995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.5081379680014, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59007.21182767066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.07041599996683, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72297.3000075001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.203279359686796, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.75301193806251, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.921139528492176, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.9750591477635, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:41:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2569.8311404999004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7990.959115999999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.11115059466101, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.34963135199723, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.52654550001171, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.77123749994735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.055629099922392, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.10565390062231, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.188879242395696, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.683953754812755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:14:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14923.420833999444, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6405.790125000067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 220.1605600566727, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.3461874133312, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 175.95376700000998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72.50817299996015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.74639971695998, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.402849202360066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.56766392840541, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.10504541279752, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:21:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -78540,668 +78540,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716536967175, + "date": 1716622601693, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3674.2376610000065, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6031.264369499922, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.098352476661, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.44546363666662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.25462249994598, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.30550150001181, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.580405419507663, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.3387351725222, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.82968114245691, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.18327821422785, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:33:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6158.242706500005, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6109.72768150009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.9525071600201, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.34567401334182, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.32726299994829, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.67937049996908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.84905220986917, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.591272418168025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.341917398678106, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.09092174712987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:00:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66139.41254149994, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 256462.49853600012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 43122.886156283326, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242283.18882904603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 43070.87479899997, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243359.83851049969, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.30706422659057, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.40998149569158, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.03386879880344, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.80840559486172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:51:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18679.325246499957, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6056.822688500006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 231.30088457267934, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.36565666999104, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.6688005000251, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.84811750011067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 153.79304180612957, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.67301465337826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.74625207767156, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.34406697679638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:24:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1851.8035160000181, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77362.13200599991, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.9118324999951, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57560.854989348, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.61253549998946, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70575.79673300005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.751144219890206, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.98880239881521, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.088412348256844, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.72136673644505, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:45:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2647.3707849995662, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1829.048375500406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.92175404267255, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.66710255002788, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.58569550020911, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.33029700013503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.90610722549118, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.626857479513577, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.8848995473422, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.911935682773704, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:17:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1952.816832500048, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62127.03208150015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.18456532334434, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26797.982637383324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.47611900016636, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25023.304212999392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.603912201429557, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.5416417471531, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.803728676648895, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.664749041314, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:11:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19390.905797000414, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5090.012251000189, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1893.6356199340166, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.4689235599435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 314.27889649967256, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.12264149871771, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.7659810246803, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.19461875864989, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.45676941805075, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.825566204415544, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:18:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5496.9637855001565, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59761.77965350007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.92184810803397, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37601.94518529133, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.60201850078738, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37383.080577499866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.53539019691746, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.36235305567693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.7112078901483, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.9264288851969, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:43:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5146.833499498825, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1940.5441085004895, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.67963537338558, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.5169629466521, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.43261250085197, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.2674049995112, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.51844826831605, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.064534162064737, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.104311450581175, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.140311892694836, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6061.453177999965, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2411.4901895000003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.60437605000484, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.89777418667168, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.86141350008802, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.16683850034678, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.935846385705155, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.954706164294176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.556099753483785, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.164560357492107, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:08:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1959.2217435001658, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13703.912494499946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.03675333999126, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 200.798709573323, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.935721999881935, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.21959099978267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.231196407969382, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.51241611053686, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.280820497209362, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 112.25538014624625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:38:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 258816.12126400022, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2582.2408904996337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244034.62704731268, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.13682931333702, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245447.883156, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.3190564996803, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.6744499885317, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.15072832813987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.75007709108667, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.324471008133802, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:31:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6111.26218849995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7693.3109779999995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.27864344000031, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.85766106533146, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.03411599992705, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.95628700000816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.89042627643755, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.20704889317462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.66110326928299, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.742528299871175, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:36:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70477.60344899961, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14864.066802000252, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31452.343251931336, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 213.90278705802183, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30347.126394500265, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.76611750028314, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.51186562524066, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.36697423094968, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 204.3971678225958, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.41396716324671, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 04:54:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16623.214362499766, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1920.3525215002628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 219.05289470665957, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.54855116998927, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.0892044998218, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.15634800016778, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 136.97556747857035, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.170856558273053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.10401456193338, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.425277037628705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:58:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2505.5321414997707, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6348.104188499974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.93201387466252, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 108.28916381333177, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.59506499987401, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.46765200007121, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.742624648202092, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.17308675822627, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.925428114448234, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.84379338958343, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:51:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6096.751922499607, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2030.2066539998123, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.48616470934212, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.28930491332237, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.64066650030145, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.891775000091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.061820024085044, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.581854473220956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 48.02706380161126, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.685599667260387, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-24 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2048.1824245002827, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3643.7305149993335, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.24031411338729, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.4523402499896, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42.830491999666265, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.58675099854008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.696145043637065, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.319572147250838, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.771937668840932, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.63598660201902, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 04:04:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79057.19572249995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5920.897044999947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59007.21182767066, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.08681493996846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72297.3000075001, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.7725160004702, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.75301193806251, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.34940247664822, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.9750591477635, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.33074625403821, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 03:16:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7990.959115999999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5312.3135955002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.34963135199723, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.42202690800573, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.77123749994735, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 175.19969800014223, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.10565390062231, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.6223924948184, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.683953754812755, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.70505213202464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6405.790125000067, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15973.229238000386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.3461874133312, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 248.27122630997718, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72.50817299996015, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.4686200003489, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.402849202360066, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.39758932653328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.10504541279752, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.34830056155036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-24 02:29:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -79222,668 +79222,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716622601693, + "date": 1716622960721, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6031.264369499922, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2504.7344915001304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.44546363666662, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.13965160133496, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.30550150001181, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.43372849983643, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.3387351725222, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.53969207923632, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.18327821422785, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.685054020185998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:27:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6109.72768150009, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2361.4371645003303, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.34567401334182, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.3273589413499, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.67937049996908, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.56191150040104, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.591272418168025, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.418019590081464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.09092174712987, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15.635949923512728, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:51:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 256462.49853600012, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3611.443037999379, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242283.18882904603, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.57639907332668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243359.83851049969, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.61489900109154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.40998149569158, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.954262386158387, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.80840559486172, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.42831613275324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:22:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6056.822688500006, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7404.824202499981, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.36565666999104, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.15427302533803, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.84811750011067, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.7996835000058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.67301465337826, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.452128171401796, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.34406697679638, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.96808436744122, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:59:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77362.13200599991, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5968.643334000035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57560.854989348, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.07519155334617, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70575.79673300005, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.55286099993009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.98880239881521, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.01037937361815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.72136673644505, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.04350030887357, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:07:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1829.048375500406, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5147.891583999808, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.66710255002788, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.405902318671, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.33029700013503, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.71799599964288, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.626857479513577, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.26570201250069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.911935682773704, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.395516495092856, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:35:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62127.03208150015, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12136.588689000746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26797.982637383324, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 161.92133887665963, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25023.304212999392, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.32198549970053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.5416417471531, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.62503236256481, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.664749041314, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.61493116307709, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:43:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5090.012251000189, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 247683.95640800032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.4689235599435, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 234411.032387272, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.12264149871771, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 234740.4976380003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.19461875864989, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.25225341825055, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.825566204415544, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.13249446891975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:52:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59761.77965350007, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6076.790150499846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37601.94518529133, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.47128707999885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37383.080577499866, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.7422000000879, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.36235305567693, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.4251162420238, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.9264288851969, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.91772018699045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:42:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1940.5441085004895, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 10745.297296500212, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.5169629466521, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 188.61554564599464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.2674049995112, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 140.71318149990475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.064534162064737, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.44202160884846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.140311892694836, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.05510422120436, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:29:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2411.4901895000003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73398.27332400001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.89777418667168, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54209.12408943067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.16683850034678, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66537.2725405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.954706164294176, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.09792893281684, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.164560357492107, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.21518476361895, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:41:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13703.912494499946, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5761.7960370007495, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 200.798709573323, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.40179928800596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.21959099978267, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.17287200044666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.51241611053686, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.716252278091964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 112.25538014624625, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.087029270857535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:47:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2582.2408904996337, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5019.673364499795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.13682931333702, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.60357411326791, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.3190564996803, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.26066899923899, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.15072832813987, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.93568086410549, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.324471008133802, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.609382320239355, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:07:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7693.3109779999995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2012.1260879996044, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.85766106533146, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.32166984004411, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.95628700000816, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.357617999939976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.20704889317462, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.458834435932188, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.742528299871175, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.56015452206276, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:34:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14864.066802000252, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56270.08645049955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 213.90278705802183, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23728.259643247988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.76611750028314, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 21910.15640349997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.36697423094968, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.0254877996741, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.41396716324671, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.79463254781044, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1920.3525215002628, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53237.17141300017, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.54855116998927, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32490.671532157336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.15634800016778, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31882.60320900008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.170856558273053, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.03625464512164, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.425277037628705, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.38718661263374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:01:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6348.104188499974, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6329.448286499996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 108.28916381333177, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.97678893333416, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.46765200007121, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.86343199997691, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.17308675822627, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.05337502494129, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.84379338958343, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.76410726584316, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:16:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2030.2066539998123, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6007.674074999841, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.28930491332237, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.78175235666428, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.891775000091, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.45285550024528, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.581854473220956, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.34144362994333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.685599667260387, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.9417495712645, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:54:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3643.7305149993335, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1925.279082999623, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.4523402499896, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.25064233332039, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.58675099854008, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.08967000031771, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.319572147250838, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 10.965364766620553, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.63598660201902, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.035743212651537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:23:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5920.897044999947, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1800.6607509996684, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.08681493996846, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.66373576337112, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.7725160004702, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.42086900004506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.34940247664822, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.420129707509071, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.33074625403821, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.642768863311378, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:00:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5312.3135955002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11904.731997000454, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.42202690800573, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 201.0525258406712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 175.19969800014223, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 151.09472750009445, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.6223924948184, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.91106637531124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.70505213202464, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.19579719517606, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:33:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15973.229238000386, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1896.5291289996458, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 248.27122630997718, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.49556736662696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.4686200003489, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.09356399979515, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.39758932653328, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.103081050226754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.34830056155036, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.383000956841924, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:07:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -79904,668 +79904,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716622960721, + "date": 1716622967578, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2504.7344915001304, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6087.053470500905, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.13965160133496, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.27033984530863, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.43372849983643, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.28590799971425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.53969207923632, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.27311377605129, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.685054020185998, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.228761984107344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2361.4371645003303, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19589.319870999134, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.3273589413499, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1984.5874082339953, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.56191150040104, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 309.7395859995231, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.418019590081464, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.5124573048829, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15.635949923512728, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.74307465226795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:44:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3611.443037999379, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1867.0875650004746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.57639907332668, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.74675671998375, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.61489900109154, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.528012999886414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.954262386158387, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.87366424046731, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.42831613275324, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.188751277435541, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:26:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7404.824202499981, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66141.291937, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.15427302533803, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 43037.73437554266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.7996835000058, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42584.142079000005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.452128171401796, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.05397899932792, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.96808436744122, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.00951704392612, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:38:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5968.643334000035, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18228.306296000028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.07519155334617, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 236.82018156332683, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.55286099993009, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 194.85675999976593, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.01037937361815, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 150.58851241833838, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.04350030887357, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.70345973423565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:32:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5147.891583999808, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5145.437896000658, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.405902318671, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.9104367600424, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.71799599964288, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.93177600077615, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.26570201250069, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.52623326125513, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.395516495092856, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.122674256257213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:36:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12136.588689000746, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70695.30652600042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 161.92133887665963, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31733.29802215001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.32198549970053, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30824.094797000726, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.62503236256481, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.99178777409415, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.61493116307709, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.85377518748848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:09:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 247683.95640800032, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5441.562446500029, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 234411.032387272, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.71218506135966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 234740.4976380003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.88336850035557, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.25225341825055, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.32396189113019, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.13249446891975, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.381626289477126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:26:08 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6076.790150499846, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3683.198848999382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.47128707999885, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.74309152326168, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.7422000000879, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.91837800123903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.4251162420238, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.700858438495132, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.91772018699045, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.94484287721668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:55:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 10745.297296500212, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6106.959968999945, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 188.61554564599464, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.64483668669224, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 140.71318149990475, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.6377919998722, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.44202160884846, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.181763937938484, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.05510422120436, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.75833014081126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73398.27332400001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16295.719529999587, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54209.12408943067, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 215.7550698966558, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66537.2725405, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 175.40716299981796, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.09792893281684, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.71233496425143, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.21518476361895, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 137.6976882858219, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:11:43 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5761.7960370007495, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2501.266753499749, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.40179928800596, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.88623574000788, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.17287200044666, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.03689400054282, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 52.716252278091964, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.54806090135829, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.087029270857535, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.701945583532023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:03:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5019.673364499795, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1954.6669180003846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.60357411326791, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.90659437666909, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.26066899923899, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.649019000673434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.93568086410549, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.496277096143636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.609382320239355, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.771188410709488, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:54:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2012.1260879996044, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2053.8726159998077, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.32166984004411, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.95088673997209, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.357617999939976, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.46892750011466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.458834435932188, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.762995127248177, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.56015452206276, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.824517697848322, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:00 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56270.08645049955, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2641.5741414998593, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23728.259643247988, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.56773874400096, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 21910.15640349997, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.30907450035374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.0254877996741, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.862708211385034, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.79463254781044, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.8965723674272, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:46:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53237.17141300017, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7975.0274250000075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 32490.671532157336, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.20777703200505, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31882.60320900008, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.1412024999754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.03625464512164, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.22152148283052, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.38718661263374, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.85854985409576, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:46:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6329.448286499996, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6414.791728000011, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.97678893333416, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.42439213332712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.86343199997691, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71.28911649994052, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.05337502494129, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.492517040696626, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.76410726584316, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.22096165468236, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:25:52 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6007.674074999841, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6151.79374000013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.78175235666428, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.00769699332523, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.45285550024528, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.24624499997844, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.34144362994333, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.81058708757571, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.9417495712645, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.314714704806526, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:03:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1925.279082999623, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1960.4870684997877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.25064233332039, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.26428046660537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.08967000031771, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.91496050002752, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 10.965364766620553, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.222392342401225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.035743212651537, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.286118063279309, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1800.6607509996684, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259450.7627295002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.66373576337112, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245340.87704937998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.42086900004506, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246088.97628649994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.420129707509071, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.7196112040547, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.642768863311378, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.9213712702126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11904.731997000454, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79124.52998549998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 201.0525258406712, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59467.74121747067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 151.09472750009445, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72249.8947534998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.91106637531124, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.52950054240793, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.19579719517606, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.22939064299659, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:16:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1896.5291289996458, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6101.417414000025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.49556736662696, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.19921564999822, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.09356399979515, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.65352849998453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.103081050226754, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.80125146138109, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.383000956841924, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.610102991416355, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -80586,668 +80586,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716622967578, + "date": 1716708770539, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6087.053470500905, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13719.834091999473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.27033984530863, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 199.3323442593355, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.28590799971425, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 161.42437600001358, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.27311377605129, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.49689771698549, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 48.228761984107344, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 112.18992244145717, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:04:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19589.319870999134, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5296.767851999903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1984.5874082339953, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.53457311204207, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 309.7395859995231, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.78857699946093, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.5124573048829, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.458506446936795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.74307465226795, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.67174997975881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 05:12:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1867.0875650004746, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2032.2572195000248, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.74675671998375, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.81863097331734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.528012999886414, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.790129000313755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.87366424046731, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.582463725070378, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.188751277435541, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.698387482363728, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:38:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66141.291937, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2575.869489500292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 43037.73437554266, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.25202191999172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42584.142079000005, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.06724200015015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.05397899932792, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.216379730991278, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.00951704392612, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.215824231136967, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:45:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18228.306296000028, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15963.914889499392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 236.82018156332683, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 252.89774244331053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 194.85675999976593, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.27316349987086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 150.58851241833838, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.5767083485674, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.70345973423565, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 134.35014069593458, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:17:30 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5145.437896000658, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7744.966965999993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.9104367600424, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.67808693733514, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.93177600077615, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.4804339998782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.52623326125513, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.31995415834249, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.122674256257213, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.84771027160454, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:56:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70695.30652600042, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5091.644082499442, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31733.29802215001, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.94019860650344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30824.094797000726, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.469466500057024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.99178777409415, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.180142753186054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.85377518748848, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.832656555818044, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:48:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5441.562446500029, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14898.216261499783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.71218506135966, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 216.7655932633364, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.88336850035557, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 168.2440224999482, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.32396189113019, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.29937097038504, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.381626289477126, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.60961795147601, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:37:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3683.198848999382, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59762.60505950006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.74309152326168, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37824.285817384, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.91837800123903, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37138.776646, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.700858438495132, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.65028427105449, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.94484287721668, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.05101871480682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-25 04:27:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6106.959968999945, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6352.270934000046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.64483668669224, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.53123650666824, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.6377919998722, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.64712799999961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.181763937938484, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.163775880896324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.75833014081126, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.864718307407074, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:02:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16295.719529999587, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63241.37254749894, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 215.7550698966558, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 27463.514011721312, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 175.40716299981796, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26067.371138500675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.71233496425143, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.51776810311102, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 137.6976882858219, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.21703349379666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:51:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2501.266753499749, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1919.3635840001662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.88623574000788, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.73241906336382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.03689400054282, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.73461299961491, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.54806090135829, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.227132877671762, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.701945583532023, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.546944922548871, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1954.6669180003846, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77040.78036249985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.90659437666909, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57015.35536195602, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.649019000673434, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70062.93960500011, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.496277096143636, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.87456157393004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.771188410709488, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.98551638928194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:04:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2053.8726159998077, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3644.370075500319, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.95088673997209, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.1177609900169, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.46892750011466, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.1996570000374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.762995127248177, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.16515333873722, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.824517697848322, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.60205459914785, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:58:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2641.5741414998593, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5880.626533000395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.56773874400096, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.37698160937725, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.30907450035374, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.13219200116873, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.862708211385034, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.17952147752356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.8965723674272, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.375887840359105, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 04:10:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7975.0274250000075, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 256106.76833649995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.20777703200505, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241175.34822180797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.1412024999754, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242843.23037950025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.22152148283052, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.7800826563357, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.85854985409576, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.41403563661216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:37:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6414.791728000011, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6005.199937999919, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.42439213332712, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.69117487666374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71.28911649994052, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.98021049999716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.492517040696626, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.394355428637475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.22096165468236, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.33758301703981, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:23:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6151.79374000013, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6105.812039000057, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.00769699332523, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.84896812001048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.24624499997844, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.20544299994435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.81058708757571, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.54418925176356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.314714704806526, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.03690141021941, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:54:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1960.4870684997877, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6035.864570500053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.26428046660537, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.6156219099712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.91496050002752, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.08461649996934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.222392342401225, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.57377718535252, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.286118063279309, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.19125505428643, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:32:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259450.7627295002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2404.0219759999673, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245340.87704937998, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.56893722801397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246088.97628649994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.24620750004397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.7196112040547, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.934681919266144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.9213712702126, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.157597388916138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:25:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79124.52998549998, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1937.9353819999778, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59467.74121747067, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.05925926001146, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72249.8947534998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.29892349990405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.52950054240793, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.060224252329409, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.22939064299659, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.175544986865699, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 03:10:40 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6101.417414000025, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1853.6365575000673, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.19921564999822, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.67919123664494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.65352849998453, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.12286549983037, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.80125146138109, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.59777746379352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.610102991416355, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.84972288918397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-25 02:30:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -81268,668 +81268,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716708770539, + "date": 1716796233577, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13719.834091999473, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6364.633738500004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 199.3323442593355, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.9386039533274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 161.42437600001358, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.67654700006187, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.49689771698549, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.20073086192139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 112.18992244145717, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.898053901060116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5296.767851999903, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6081.8272660001185, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.53457311204207, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.14610562666105, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.78857699946093, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.3581659999727, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.458506446936795, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.76893995195344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.67174997975881, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.399661849383165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:31:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2032.2572195000248, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1917.0592025002406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.81863097331734, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.56634040335362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.790129000313755, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.77325799974642, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.582463725070378, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.22545734704069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.698387482363728, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.533952667372015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:52:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2575.869489500292, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7684.304952500042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.25202191999172, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.3382852573356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.06724200015015, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.05097249997743, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.216379730991278, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.028611033938596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.215824231136967, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.690790700825, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:05:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15963.914889499392, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6061.761682999986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 252.89774244331053, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.2390800299907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.27316349987086, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.66533649993835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.5767083485674, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.43393309823268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 134.35014069593458, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.259457282322444, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 05:05:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7744.966965999993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6107.716720999861, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.67808693733514, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.38538521333551, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.4804339998782, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.93042649978997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.31995415834249, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.5905017773301, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.84771027160454, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.097509785793804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:32:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5091.644082499442, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2031.022674000269, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.94019860650344, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.5421797400292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.469466500057024, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.857552000488795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.180142753186054, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.58746713869253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.832656555818044, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.640850055882305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:49:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14898.216261499783, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5848.073216000557, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 216.7655932633364, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.49546459466849, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 168.2440224999482, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.30563050014462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.29937097038504, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53.142382924865366, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.60961795147601, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.474124152207025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 04:11:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59762.60505950006, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 255900.11857799982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37824.285817384, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 240964.17156914668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37138.776646, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242644.34600000049, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.65028427105449, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.83144516244961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.05101871480682, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.3410904844147, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:40:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6352.270934000046, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1823.9216125002713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.53123650666824, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.40534543998365, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.64712799999961, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.77597750031782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.163775880896324, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.526363259015222, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.864718307407074, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.851952907261081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:19:06 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63241.37254749894, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59915.679922999974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 27463.514011721312, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37919.86216178468, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 26067.371138500675, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37338.46950349994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.51776810311102, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.80755902200929, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.21703349379666, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.77823988364473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:41:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1919.3635840001662, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1941.0032979999414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.73241906336382, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.25834224004575, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.73461299961491, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.81687200025408, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.227132877671762, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.103759690539231, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.546944922548871, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.141562020117878, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:58:59 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77040.78036249985, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2574.1813324998475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57015.35536195602, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.24578948798566, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70062.93960500011, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.40900949972274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.87456157393004, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.153861356958195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.98551638928194, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.232438050034375, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:05:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3644.370075500319, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76608.26362750004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.1177609900169, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56964.278056652016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.1996570000374, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69819.15318949995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.16515333873722, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.88916489565437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.60205459914785, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.55699893438523, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:21:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5880.626533000395, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3631.3388734997716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.37698160937725, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.52117958994253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.13219200116873, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.93760449923138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.17952147752356, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.313517914314907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.375887840359105, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.61104615833451, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-26 04:57:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 256106.76833649995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16183.035134999955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241175.34822180797, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 276.1944895666469, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242843.23037950025, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 161.85223650063563, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.7800826563357, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.7125780492183, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.41403563661216, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 136.00812021365124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6005.199937999919, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14907.566597499681, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.69117487666374, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 214.5425809306859, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.98021049999716, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.90863349994834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.394355428637475, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.74411065262855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.33758301703981, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.70612291771327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:25:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6105.812039000057, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5096.1581319998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.84896812001048, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.35703474664479, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.20544299994435, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.95373600014864, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.54418925176356, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.222790376713746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.03690141021941, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.797951826792268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:49:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6035.864570500053, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13693.951621999531, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.6156219099712, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.06395390866965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.08461649996934, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.00548500005607, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.57377718535252, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.6035869178744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.19125505428643, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 112.502632367524, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 02:56:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2404.0219759999673, + "value": 2423.249550500259, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.56893722801397, + "value": 115.57257120667418, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.24620750004397, + "value": 80.09124349973717, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.934681919266144, + "value": 18.016155885708706, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.157597388916138, + "value": 16.201786527420072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:38:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1937.9353819999778, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62925.006167501124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.05925926001146, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 27203.82154675731, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.29892349990405, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25508.403936501054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.060224252329409, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.4872685244404, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.175544986865699, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 196.73251961813324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:26:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1853.6365575000673, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5266.754461499659, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.67919123664494, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.3137356586764, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.12286549983037, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.07800699998188, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.59777746379352, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.42113770176542, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.84972288918397, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.674829500863154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-26 03:32:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -81950,668 +81950,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716796233577, + "date": 1716796250867, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6364.633738500004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73557.1878830001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.9386039533274, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54274.971237412, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.67654700006187, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66496.79136700001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.20073086192139, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.08082973808818, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.898053901060116, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.16252433917934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:13:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6081.8272660001185, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 251570.10278099982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.14610562666105, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 237403.6621778293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.3581659999727, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 238762.90595699992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.76893995195344, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.15832508481148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.399661849383165, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.70448818083523, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:57:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1917.0592025002406, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55312.02972349911, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.56634040335362, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23317.65573052867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.77325799974642, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 21930.38267600059, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.22545734704069, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 179.27011979201416, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.533952667372015, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 190.4971275555548, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7684.304952500042, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 10766.853013500167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.3382852573356, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.8137156466728, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.05097249997743, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.7612689998059, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.028611033938596, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.71945132309382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.690790700825, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.43081593084692, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:33:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6061.761682999986, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1806.416813999931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.2390800299907, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.19676065001597, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.66533649993835, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.85134750021825, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.43393309823268, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.474161891497117, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.259457282322444, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.706609286777685, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:26:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6107.716720999861, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1926.0076245000164, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.38538521333551, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.15665756669962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.93042649978997, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.7329220002357, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.5905017773301, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 10.959522930240288, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.097509785793804, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.039550408273529, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2031.022674000269, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 53648.168086000165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.5421797400292, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32871.818626446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.857552000488795, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32546.4946269999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.58746713869253, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.3608748026115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.640850055882305, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.57838180275058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5848.073216000557, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7398.480855999992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.49546459466849, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 153.49252890133266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.30563050014462, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.28433100007896, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53.142382924865366, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.40085130142113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 46.474124152207025, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.041959403026794, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:59:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 255900.11857799982, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6325.567019999994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 240964.17156914668, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.07393185333513, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242644.34600000049, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.99479499997096, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.83144516244961, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.99296529157036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.3410904844147, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.6789553645544, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:20:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1823.9216125002713, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5637.292696499571, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.40534543998365, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.31337282798145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.77597750031782, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.89296299983107, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.526363259015222, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 51.520604634899385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.851952907261081, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 44.854367200206454, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:33:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59915.679922999974, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5980.5163245000585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37919.86216178468, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.54726676999069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37338.46950349994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.09528350009532, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.80755902200929, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.12558342406395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.77823988364473, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.9765445679687, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:41:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1941.0032979999414, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3618.1140245007555, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.25834224004575, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 141.82623874327874, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.81687200025408, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.56672199985405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.103759690539231, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.139670020996387, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.141562020117878, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.46367132795628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2574.1813324998475, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12188.274419499066, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.24578948798566, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 161.0161329326705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.40900949972274, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.06120350048877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.153861356958195, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 108.9521467868849, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.232438050034375, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.51454962550012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76608.26362750004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1903.7392800000816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56964.278056652016, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.64907631331091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69819.15318949995, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.35489200030497, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.88916489565437, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.068286042205282, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.55699893438523, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.353648010151852, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:06:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3631.3388734997716, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5171.4912204997745, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.52117958994253, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.80138122403878, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.93760449923138, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.40880849981477, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.313517914314907, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.301702569622094, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.61104615833451, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.28060892167015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:22:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16183.035134999955, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2484.973353499754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 276.1944895666469, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.97007295466271, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 161.85223650063563, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.28220599969427, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.7125780492183, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.468503229402156, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 136.00812021365124, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.58658233498964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14907.566597499681, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11867.508392499985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 214.5425809306859, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.41663807199137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.90863349994834, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 150.46568049956477, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.74411065262855, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.45414866305231, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.70612291771327, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.95484754743093, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5096.1581319998, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5143.1367664999925, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.35703474664479, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.24412510669451, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.95373600014864, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.25308599986602, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.222790376713746, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.44000860144758, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.797951826792268, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.029967242201604, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:50:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13693.951621999531, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2017.1134680003888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.06395390866965, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.84266767330823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.00548500005607, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.14306449992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.6035869178744, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.477915355710886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 112.502632367524, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.574823246872853, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2423.249550500259, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2402.1529614997235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.57257120667418, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.60082032532955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.09124349973717, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.58655049994923, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.016155885708706, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.448930916745994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.201786527420072, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15.705658343132942, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:39:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62925.006167501124, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6083.561831499992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 27203.82154675731, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.6075839333292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25508.403936501054, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.40378549993875, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.4872685244404, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.45431691825654, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 196.73251961813324, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.949767179674346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:42:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5266.754461499659, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6001.731613000175, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.3137356586764, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.3772720366763, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.07800699998188, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.91565199997785, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.42113770176542, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.296355154165305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.674829500863154, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.158321523451804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.10.12 (main, May 10 2024, 13:42:25) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:32:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -82632,668 +82632,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716796250867, + "date": 1716796264101, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73557.1878830001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6090.712625499691, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54274.971237412, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.75989160935812, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66496.79136700001, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.22087349917274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.08082973808818, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.08633192431127, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.16252433917934, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.06244881207504, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:09:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 251570.10278099982, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2501.6659124999023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 237403.6621778293, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.83828453599442, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 238762.90595699992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.57532300012826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.15832508481148, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.668762762910536, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.70448818083523, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.743856165047696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:23:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55312.02972349911, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1971.930427499501, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23317.65573052867, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.11258741336253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 21930.38267600059, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.800313499654294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 179.27011979201416, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.239623345137028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 190.4971275555548, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.338959846483151, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 10766.853013500167, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2650.326251000479, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.8137156466728, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.98484990135087, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.7612689998059, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.89696350009035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.71945132309382, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.95477410125936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.43081593084692, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.90629757986169, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:49:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1806.416813999931, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1870.622367499891, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.19676065001597, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.77259321667832, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.85134750021825, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.49170350004351, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.474161891497117, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.819426943181075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.706609286777685, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.086471520709258, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:36:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1926.0076245000164, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16360.486754500016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.15665756669962, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 214.87865147801796, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.7329220002357, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.61797999981354, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 10.959522930240288, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.23870575233246, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.039550408273529, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 139.48319644772516, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:30:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 53648.168086000165, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79396.0280839999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 32871.818626446, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59363.842118681336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 32546.4946269999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72635.35564699987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.3608748026115, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.99403910563898, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.57838180275058, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.204830952399, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:44:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7398.480855999992, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3683.4174590003386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 153.49252890133266, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.18027123327434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.28433100007896, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.91020449968346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.40085130142113, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.714879681573194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.041959403026794, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.938975221457362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:36:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6325.567019999994, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5144.204072500543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.07393185333513, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.63029272668307, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.99479499997096, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.391946499607, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.99296529157036, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.518711974408774, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.6789553645544, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.08318313158899, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:23:53 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5637.292696499571, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19198.749321500145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.31337282798145, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1733.9371251993132, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.89296299983107, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 288.0548259990974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 51.520604634899385, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.1794857617087, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 44.854367200206454, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.9221872172329, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:00:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5980.5163245000585, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6406.453824000038, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 124.54726676999069, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.01004096666732, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.09528350009532, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71.89030799997909, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.12558342406395, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.41407391325501, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.9765445679687, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.153699872384635, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:30:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3618.1140245007555, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18101.753664499938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 141.82623874327874, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 228.79141934532768, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.56672199985405, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.4439295003467, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.139670020996387, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 150.3248974939885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.46367132795628, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.98063748205632, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:24:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12188.274419499066, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6116.125251000085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 161.0161329326705, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.95577656666651, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.06120350048877, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.52334149983653, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 108.9521467868849, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.03362274360493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.51454962550012, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.68617036866701, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:07:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1903.7392800000816, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71091.10926299945, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.64907631331091, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32103.595689588667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.35489200030497, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31386.506036500577, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.068286042205282, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 193.23542843331893, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.353648010151852, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.83707845247366, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:02:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5171.4912204997745, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7972.799128499901, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.80138122403878, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.74704809600038, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.40880849981477, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.2453224999217, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.301702569622094, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.0586387482135, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.28060892167015, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.609164657590256, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:34:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2484.973353499754, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64822.7314310002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.97007295466271, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42262.49112645533, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.28220599969427, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42000.16301999995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.468503229402156, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.80040609375101, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.58658233498964, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.00390717133526, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:08:19 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11867.508392499985, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6158.610826499853, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 197.41663807199137, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.86893211331275, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 150.46568049956477, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.60312150006575, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.45414866305231, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.851245836175835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.95484754743093, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.351281267023836, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:14:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5143.1367664999925, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1960.810206000133, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.24412510669451, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.944098916656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.25308599986602, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.437391000457865, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.44000860144758, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.507841379162667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.029967242201604, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.779278380467078, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:52:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2017.1134680003888, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6102.325008500088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.84266767330823, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.6733946866619, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.14306449992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.94268999993892, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.477915355710886, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.903677714071506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.574823246872853, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.694808280519105, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:56:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2402.1529614997235, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2055.765110500033, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.60082032532955, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.66302794668566, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.58655049994923, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.71838249956272, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.448930916745994, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.723184369227475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15.705658343132942, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.82769511533102, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:42:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6083.561831499992, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 260015.09256049985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.6075839333292, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245449.95221155128, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.40378549993875, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246880.73067000005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.45431691825654, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.5629141692095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.949767179674346, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.7125993259444, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:53:31 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6001.731613000175, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5494.843467499777, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.3772720366763, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.25016152003082, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.91565199997785, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.3125165003221, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.296355154165305, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.02541022551337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.158321523451804, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.13659364626864, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.11.4 (main, May 10 2024, 13:52:50) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:00:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -83314,1350 +83314,1350 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-23T22:20:55Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" }, - "date": 1716796264101, + "date": 1716884889708, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6090.712625499691, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16201.605522999671, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.75989160935812, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 211.75247188932855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.22087349917274, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.3825245000844, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.08633192431127, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.57625620982662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 48.06244881207504, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 138.15701727392738, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:06:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2501.6659124999023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19580.348333499387, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.83828453599442, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1972.5583384559795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.57532300012826, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 304.52867400072137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.668762762910536, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.21254493580938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.743856165047696, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.1007761496972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:46:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1971.930427499501, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6395.053391500028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.11258741336253, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.79965263333163, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.800313499654294, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.57267850011795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.239623345137028, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.45611448358153, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.338959846483151, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.183693710455756, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:34:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2650.326251000479, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6090.38060700027, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.98484990135087, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.32542474666631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.89696350009035, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.89052250008899, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.95477410125936, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.143692672131074, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.90629757986169, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.69986765979999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:12:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1870.622367499891, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69626.14286649931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.77259321667832, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31168.30876382667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.49170350004351, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30344.20343200054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.819426943181075, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.61807984477645, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.086471520709258, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.43252203435824, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:40:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16360.486754500016, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2051.071790500373, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 214.87865147801796, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.49340154662302, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.61797999981354, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.705377999984194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.23870575233246, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.760500238520041, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 139.48319644772516, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.848424723985891, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:53:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79396.0280839999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1858.8892060001854, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59363.842118681336, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.7504453600286, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72635.35564699987, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.39118899980167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.99403910563898, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.863294365573847, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.204830952399, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.124532837536922, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:12:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3683.4174590003386, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6160.110381499862, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.18027123327434, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.44071540000793, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.91020449968346, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.7529545000416, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.714879681573194, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.84408546641255, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.938975221457362, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.29976061199906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:29:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5144.204072500543, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7953.783899500081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.63029272668307, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.2286971733347, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.391946499607, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.54343499997867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.518711974408774, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.04505810898644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.08318313158899, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.62338436575791, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:57:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19198.749321500145, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1953.3334455004479, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1733.9371251993132, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.69248994335915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 288.0548259990974, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.62143650019425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.1794857617087, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.45766893648569, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.9221872172329, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.79064085537104, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 05:13:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6406.453824000038, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2653.223817000253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.01004096666732, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.80828239603458, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71.89030799997909, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.46653200028231, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.41407391325501, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.888232515726404, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.153699872384635, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.860818240535956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:25:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18101.753664499938, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1959.0576464997866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 228.79141934532768, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.11340747995442, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.4439295003467, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.480169999729696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 150.3248974939885, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.237360838286664, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.98063748205632, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.315057310507168, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:19:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6116.125251000085, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5529.696613000851, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.95577656666651, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.931280946701, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.52334149983653, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.97463999966567, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.03362274360493, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.11881991682275, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.68617036866701, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.249203532127844, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71091.10926299945, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259406.86361500047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 32103.595689588667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245299.80470982537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31386.506036500577, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246052.46434299988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 193.23542843331893, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.71551285958857, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.83707845247366, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.99507171601392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:49:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7972.799128499901, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79653.13417749986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.74704809600038, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59612.45751327067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.2453224999217, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72702.21142749983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.0586387482135, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.14168817573746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.609164657590256, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.53047094928182, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:38:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64822.7314310002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6115.854938499979, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42262.49112645533, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.14745964796748, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42000.16301999995, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.83024049959931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.80040609375101, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.07349761011697, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.00390717133526, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.15403204615488, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:47:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6158.610826499853, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5146.552168999733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.86893211331275, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.85134442655058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.60312150006575, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.19642650023161, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.851245836175835, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.59965823925252, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.351281267023836, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.13763988135345, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:56:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1960.810206000133, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2502.701516000343, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.944098916656, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.60287315465757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.437391000457865, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.2339850001299, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.507841379162667, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.695595869744587, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.779278380467078, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.790618902530667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:06:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6102.325008500088, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64031.919593999875, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.6733946866619, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41767.55604205267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.94268999993892, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40978.205925499875, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.903677714071506, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.63790912789109, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.694808280519105, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.86352524807036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 02:31:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2055.765110500033, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3670.86116649989, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.66302794668566, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.0698215466873, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.71838249956272, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.82754149990069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.723184369227475, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.60844089167771, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.82769511533102, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.943433647832865, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 04:00:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 260015.09256049985, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6097.099796500061, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245449.95221155128, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.64881523998747, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246880.73067000005, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.4196249999859, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.5629141692095, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.767088579470645, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.7125993259444, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.68399358107581, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-27 03:27:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5494.843467499777, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17989.83830050065, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.25016152003082, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 228.6407266526506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.3125165003221, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.07948700000998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.02541022551337, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 149.10081188543847, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.13659364626864, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 153.7965519629422, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-27 04:39:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "dhuangnm", - "username": "dhuangnm", - "email": "74931910+dhuangnm@users.noreply.github.com" + "name": "Andy Linfoot", + "username": "andy-neuma", + "email": "78757007+andy-neuma@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "a6b94433cc411da4e03724d54c5b158a24cfc6b3", - "message": "update install commands (#264)\n\nUse nm pypi to install.\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm ", - "timestamp": "2024-05-23T22:20:55Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/a6b94433cc411da4e03724d54c5b158a24cfc6b3" + "id": "a160eb945554fe6581427558577c28a07577d53b", + "message": "switch benchmarking and testing jobs to run using \"test\" label (#273)\n\nSUMMARY:\r\n* update benchmarking, testing, and accuracy jobs to run on label\r\n`aws-test-a10g-24G` or `aws-test-a10-96G` which is based on \"vanilla\r\ndeeplearning\" AMI\r\n* update relevant GHA actions and workflows to not be dependent on\r\n`pyenv` virtualenv\r\n* update \"model cache\" to use local disk as opposed to \"EFS\"\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma \r\nCo-authored-by: Domenic Barbuzzi ", + "timestamp": "2024-05-30T23:47:14Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b" }, - "date": 1716884889708, + "date": 1717139962713, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16201.605522999671, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1944.325737500094, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 211.75247188932855, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.34725278669308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.3825245000844, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.81818549982563, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.57625620982662, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.076899806348088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 138.15701727392738, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.171032847039678, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:56:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19580.348333499387, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3637.295575499593, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1972.5583384559795, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.26459940323667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 304.52867400072137, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.88423449925176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.21254493580938, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.2629330304988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.1007761496972, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.54240381698151, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:35:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6395.053391500028, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1918.2469320003293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.79965263333163, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.97013177663636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.57267850011795, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.28486299990254, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.45611448358153, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.22190179143599, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.183693710455756, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.48989199564831, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:24:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6090.38060700027, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2557.2039315002257, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.32542474666631, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.180840850661, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.89052250008899, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.95183950005958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.143692672131074, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.851389004810983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.69986765979999, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.897805321379792, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:05:22 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69626.14286649931, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6099.552767499972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31168.30876382667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.5047685733419, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30344.20343200054, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.46140199991169, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.61807984477645, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.55624090990519, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.43252203435824, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.06870531956898, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:02:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2051.071790500373, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11868.245605000084, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.49340154662302, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 193.44991913400673, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.705377999984194, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 152.2072479997405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.760500238520041, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.0558582996331, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.848424723985891, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.73852507711578, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:03:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1858.8892060001854, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13403.280489499593, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.7504453600286, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 204.9766981546703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.39118899980167, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.49696250031775, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.863294365573847, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.17390393887145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.124532837536922, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.09828485727913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:42:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6160.110381499862, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56598.711176999816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.44071540000793, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34986.424011133335, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.7529545000416, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34492.74494500003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.84408546641255, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.16211967293555, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.29976061199906, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.01024498023664, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:57:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7953.783899500081, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7510.366780500021, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.2286971733347, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.15236881733472, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.54343499997867, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.7449084999962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.04505810898644, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.98934003479815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.62338436575791, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.47155360192235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:38:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1953.3334455004479, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73175.14265900014, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.69248994335915, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54376.65658664667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.62143650019425, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65844.2635580002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.45766893648569, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.31190011712384, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.79064085537104, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.81002672030278, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:09:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2653.223817000253, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5988.901054000053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.80828239603458, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.66245276332938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.46653200028231, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.23423700003241, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.888232515726404, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.098250105891715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.860818240535956, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.04777014300449, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:15:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1959.0576464997866, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6070.1711980000255, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.11340747995442, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.75487558334467, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.480169999729696, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.90729949999331, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.237360838286664, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.47720721275356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.315057310507168, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.12483052955296, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:36:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5529.696613000851, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1837.0898674997989, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.931280946701, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.4850439266326, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.97463999966567, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.32419250014209, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.11881991682275, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.571876431442663, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.249203532127844, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.806236899356767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:44:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259406.86361500047, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13168.645871999615, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245299.80470982537, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 165.8596725520183, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246052.46434299988, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.18640399968717, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.71551285958857, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.19142559224628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.99507171601392, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.56268794302693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:29:01 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79653.13417749986, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2387.404865499775, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59612.45751327067, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.58159378532824, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72702.21142749983, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.98823249995257, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.14168817573746, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.715592278812036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.53047094928182, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.002970293327937, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:14:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6115.854938499979, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57220.16011000051, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.14745964796748, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24262.49963393664, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.83024049959931, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 22467.024267500165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.07349761011697, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.6796389991773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 48.15403204615488, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.46666942799854, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:27:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5146.552168999733, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5815.404029999627, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.85134442655058, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.14034139065431, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.19642650023161, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.83753450047516, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.59965823925252, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.66757137009773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.13763988135345, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 45.88396688034135, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 05:16:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2502.701516000343, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5082.756575499843, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.60287315465757, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.22497969325438, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.2339850001299, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.9079714996551, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.695595869744587, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.283243296958865, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.790618902530667, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.845509096645735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 03:48:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64031.919593999875, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 250019.64793699994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41767.55604205267, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 235926.69918582065, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40978.205925499875, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 237063.49971650002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.63790912789109, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.50191722485226, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.86352524807036, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.43693336512705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:47:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3670.86116649989, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6359.681814499936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.0698215466873, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.18178417332972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.82754149990069, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.56043249996446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.60844089167771, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.18134608225374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.943433647832865, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.889334661983305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-28 04:33:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6097.099796500061, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5235.807850499441, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.64881523998747, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.38514000669238, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.4196249999859, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.369899999263, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.767088579470645, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.002259343469056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.68399358107581, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.27258014968854, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 02:32:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17989.83830050065, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2030.2278369995292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 228.6407266526506, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.64925478003109, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.07948700000998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.549836000333016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 149.10081188543847, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.573610354072041, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.3.0\", \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 153.7965519629422, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.648292482642121, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.3.0\",\n \"python_version\": \"3.9.17 (main, May 10 2024, 13:34:20) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-28 04:22:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -84678,668 +84678,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-30T23:47:14Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b" }, - "date": 1717139962713, + "date": 1717140262706, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1944.325737500094, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3681.097016500189, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.34725278669308, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.07798170667533, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.81818549982563, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.51540750069398, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.076899806348088, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.4821328702638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.171032847039678, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.905893282612915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:27:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3637.295575499593, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6092.723453499957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.26459940323667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.56179224000728, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.88423449925176, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.33099699994273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.2629330304988, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.01282205005549, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.54240381698151, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.60662206821262, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:24:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1918.2469320003293, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77949.97632699984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.97013177663636, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58397.31349270267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.28486299990254, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71317.3302465002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.22190179143599, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.33699641951138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.48989199564831, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.32680799715409, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:58:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2557.2039315002257, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16155.477776499993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.180840850661, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 213.6813949946748, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.95183950005958, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 166.9866280003589, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.851389004810983, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.65772220447064, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.897805321379792, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 136.31033031322832, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:04:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6099.552767499972, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5469.732399999884, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.5047685733419, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.56679687196933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.46140199991169, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.89766099979897, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.55624090990519, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.890397382191004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.06870531956898, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.03514549061607, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:50:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11868.245605000084, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6405.0570609999795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 193.44991913400673, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.46892652666808, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 152.2072479997405, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.12654699997256, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.0558582996331, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.48747596856293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.73852507711578, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.15420376744941, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:45:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13403.280489499593, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69819.26233450031, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 204.9766981546703, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31218.575220786664, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.49696250031775, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30551.620906000608, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.17390393887145, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.78076755449828, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.09828485727913, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.4536070982777, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:11:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56598.711176999816, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1864.540482999928, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34986.424011133335, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.86396844667918, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34492.74494500003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.090040000181034, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.16211967293555, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.848978666162784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.01024498023664, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.125742792799205, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:41:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7510.366780500021, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1952.8336919997855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.15236881733472, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.46964666999945, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.7449084999962, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.690230000091105, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.98934003479815, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.52746929280426, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.47155360192235, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.802806236467346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:32:43 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73175.14265900014, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65052.52764399995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54376.65658664667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42204.86256041533, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65844.2635580002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41962.35900250008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.31190011712384, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.4298790739846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.81002672030278, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.9204985423722, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:05:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5988.901054000053, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17808.83305749967, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.66245276332938, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 233.17962850800117, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.23423700003241, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.84594550020483, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.098250105891715, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.34437494057235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.04777014300449, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 152.39974601979338, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:25:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6070.1711980000255, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 257918.9633305002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.75487558334467, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243078.29471475002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.90729949999331, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244911.33549849998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.47720721275356, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.08091762761569, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.12483052955296, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.5571945066391, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:57:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1837.0898674997989, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5137.703709498965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.4850439266326, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.3516359397375, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.32419250014209, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.1904954995407, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.571876431442663, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.53577292241982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.806236899356767, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.10265720510569, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:33:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13168.645871999615, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6097.918027500782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 165.8596725520183, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.9422584399678, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.18640399968717, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.72214050017647, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.19142559224628, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.01082736972813, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.56268794302693, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.926743500999976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:07:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2387.404865499775, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6231.304446499735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.58159378532824, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.27052092667206, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.98823249995257, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.09737800015137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.715592278812036, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.07329957322707, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.002970293327937, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.51991764659937, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:39:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57220.16011000051, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1966.5706364999096, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24262.49963393664, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.57006852665896, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 22467.024267500165, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.585513499740046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.6796389991773, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.250328231535653, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.46666942799854, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.308544198746658, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:41:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5815.404029999627, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19427.53360250026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.14034139065431, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1855.6937588053127, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.83753450047516, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 296.91589050071343, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 52.66757137009773, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.95018913828906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 45.88396688034135, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.70638199244252, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:00:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5082.756575499843, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2056.6150060003565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.22497969325438, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.96489467331655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.9079714996551, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.91846550025002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.283243296958865, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.71997602230433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.845509096645735, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.823541390495773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:52:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 250019.64793699994, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7941.3647344999845, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 235926.69918582065, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.8378843453329, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 237063.49971650002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.93674150008883, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.50191722485226, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.73406646219327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.43693336512705, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.449693636550265, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:20:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6359.681814499936, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2634.1090030000487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.18178417332972, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.19483824533867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.56043249996446, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.74080150042573, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.18134608225374, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.89030300157944, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.889334661983305, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.869823937415298, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:19:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5235.807850499441, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2499.371483000232, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.38514000669238, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.43903942266479, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.369899999263, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.38136549925912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.002259343469056, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.588815181987712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.27258014968854, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.76198581413518, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:32:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2030.2278369995292, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6090.403449000064, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.64925478003109, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.24602427333352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.549836000333016, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.39523949996419, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.573610354072041, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.83419598811897, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.648292482642121, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.633694117844954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:52:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -85360,1350 +85360,1350 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-30T23:47:14Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b" }, - "date": 1717140262706, + "date": 1717140353849, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3681.097016500189, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6101.936902499801, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.07798170667533, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.02423784131679, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.51540750069398, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.86930899896106, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.4821328702638, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.97718879974617, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.905893282612915, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.95826503696106, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:25:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6092.723453499957, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19787.037736000457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.56179224000728, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2235.3907888513045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.33099699994273, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 372.0090970000456, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.01282205005549, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.99269413570408, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.60662206821262, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.18166033948762, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:59:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77949.97632699984, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18575.993863500116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58397.31349270267, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 235.9791442813427, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71317.3302465002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 188.92866600026537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.33699641951138, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 153.52913859861425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.32680799715409, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.80588856760255, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:08:29 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16155.477776499993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2633.929095500207, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 213.6813949946748, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.34896825732964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 166.9866280003589, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.49028099969291, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.65772220447064, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.880498046451304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 136.31033031322832, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.826315582132132, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:48:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5469.732399999884, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5154.42712749973, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.56679687196933, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.0732080667949, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.89766099979897, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.8526425004311, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.890397382191004, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.63378935697634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.03514549061607, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.18654722510851, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6405.0570609999795, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5510.173583000324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.46892652666808, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.90581848666866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.12654699997256, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.44350299936195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.48747596856293, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.12222401973724, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.15420376744941, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.3146013820182, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:21:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69819.26233450031, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6160.090393999781, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31218.575220786664, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.7565511466837, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30551.620906000608, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.41715999987719, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.78076755449828, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.85193974046793, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.4536070982777, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.32121772631665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:44:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1864.540482999928, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1968.2251990002442, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.86396844667918, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.58126243000636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.090040000181034, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.73574000017834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.848978666162784, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.535554081098677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.125742792799205, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.841490286731958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1952.8336919997855, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2500.1692185001048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.46964666999945, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.6543817773151, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.690230000091105, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.94254399950296, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.52746929280426, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.658654500287465, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.802806236467346, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.88008315499938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:01:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65052.52764399995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16548.779949500386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42204.86256041533, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 217.80040674933417, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41962.35900250008, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.68737250040067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.4298790739846, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 135.78960418614363, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.9204985423722, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 141.47805946059194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:43:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17808.83305749967, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6403.514041999983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 233.17962850800117, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.89103688666287, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.84594550020483, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71.60771150000755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.34437494057235, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.45230292008293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 152.39974601979338, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.14832892103383, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:14:37 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 257918.9633305002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1967.7691030005917, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243078.29471475002, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.52202370666905, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244911.33549849998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.060174000293046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.08091762761569, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.238786017536206, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.5571945066391, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.315820831757966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:23:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5137.703709498965, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79659.19065699996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.3516359397375, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59643.81296608933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.1904954995407, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72820.22716649999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.53577292241982, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.75460141925505, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.10265720510569, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.82052626066309, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:54:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6097.918027500782, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3681.1479669995606, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.9422584399678, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.8134585365939, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.72214050017647, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.1684270001133, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.01082736972813, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.65994576060037, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.926743500999976, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.927389066522032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:02:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6231.304446499735, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6075.782038499938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.27052092667206, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.86152282999653, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.09737800015137, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.82327049992455, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.07329957322707, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.00520032747418, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.51991764659937, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.601370668698856, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:52:45 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1966.5706364999096, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1854.4511349996355, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.57006852665896, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.31250808664421, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.585513499740046, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.06378100002985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.250328231535653, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.87062517806835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.308544198746658, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.122588032508888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19427.53360250026, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6109.639319000053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1855.6937588053127, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.6489557500087, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 296.91589050071343, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.53572450000502, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.95018913828906, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.853244697887696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.70638199244252, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.62685607274077, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:09:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2056.6150060003565, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 258289.11931399966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.96489467331655, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243556.90976791063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.91846550025002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245145.88887850026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.71997602230433, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.76708780687586, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.823541390495773, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.04582460958845, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:55:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7941.3647344999845, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7979.495186500003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.8378843453329, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.29165735467421, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.93674150008883, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.5408804999406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.73406646219327, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.75648220407465, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.449693636550265, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.42222664757649, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:34:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2634.1090030000487, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2061.239989999649, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.19483824533867, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.57219613999648, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.74080150042573, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.54033799982426, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.89030300157944, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.749096802673108, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.869823937415298, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.85373386428935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:07:44 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2499.371483000232, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69805.44086600002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.43903942266479, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31145.775594269337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.38136549925912, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30031.524023999737, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.588815181987712, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.17454173585992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.76198581413518, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.79648379991565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6090.403449000064, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63371.482394500046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.24602427333352, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40907.49880209133, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.39523949996419, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40442.42481850006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.83419598811897, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.60641739782716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.633694117844954, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.15978282274332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:27:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Andy Linfoot", - "username": "andy-neuma", - "email": "78757007+andy-neuma@users.noreply.github.com" + "name": "Domenic Barbuzzi", + "username": "dbarbuzzi", + "email": "dbarbuzzi@gmail.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "a160eb945554fe6581427558577c28a07577d53b", - "message": "switch benchmarking and testing jobs to run using \"test\" label (#273)\n\nSUMMARY:\r\n* update benchmarking, testing, and accuracy jobs to run on label\r\n`aws-test-a10g-24G` or `aws-test-a10-96G` which is based on \"vanilla\r\ndeeplearning\" AMI\r\n* update relevant GHA actions and workflows to not be dependent on\r\n`pyenv` virtualenv\r\n* update \"model cache\" to use local disk as opposed to \"EFS\"\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma \r\nCo-authored-by: Domenic Barbuzzi ", - "timestamp": "2024-05-30T23:47:14Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/a160eb945554fe6581427558577c28a07577d53b" + "id": "019467500eee9542833aedd6e2c17bd62ffa4b40", + "message": "Handle server startup failure in __enter__ (#274)\n\nWith a context manager class, the `__exit__` method is not called when\r\nan exception is raised during the context manager’s `__enter__` method.\r\nThis PR addresses that by manually calling that method if an exception\r\nis raised.", + "timestamp": "2024-05-31T14:44:40Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717140353849, + "date": 1717180644086, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6101.936902499801, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2483.930193499873, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.02423784131679, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.35805675598749, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.86930899896106, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.19903650019478, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.97718879974617, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.4518560301072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.95826503696106, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.64258402208881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:03:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19787.037736000457, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64156.925899000045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2235.3907888513045, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41265.56946777867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 372.0090970000456, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40700.54304000007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.99269413570408, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.41443055996379, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.18166033948762, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.44261223264846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 05:11:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18575.993863500116, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6044.974260001254, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 235.9791442813427, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.41275711870792, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 188.92866600026537, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.05784200073685, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 153.52913859861425, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.7597112437143, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.80588856760255, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.7405560297403, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:16:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2633.929095500207, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5139.528454000356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.34896825732964, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.85536933997113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.49028099969291, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.92951749937492, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.880498046451304, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.51518399288244, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.826315582132132, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.096406452598835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:09:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5154.42712749973, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6079.625620499883, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.0732080667949, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.71143561999966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.8526425004311, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.30147799994847, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.63378935697634, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.055339955676246, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.18654722510851, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.657973738382104, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:56:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5510.173583000324, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17561.944785500145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.90581848666866, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 227.8076672553434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.44350299936195, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 187.1709194997493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.12222401973724, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.49130965311733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.3146013820182, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 149.549457722834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:35:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6160.090393999781, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6065.366650499982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.7565511466837, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.55514536999408, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.41715999987719, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.8598694999746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.85193974046793, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.68262714845876, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.32121772631665, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.64022278004714, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:53:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1968.2251990002442, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1855.0040115001138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.58126243000636, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.68821522665712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.73574000017834, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.1674144998251, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.535554081098677, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.802702506272544, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.841490286731958, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.06161011869142, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 04:02:43 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2500.1692185001048, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 257382.11322200004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.6543817773151, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242495.9619792253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.94254399950296, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244313.38900100012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.658654500287465, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.75110738836511, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.88008315499938, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.94275516991033, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:42:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16548.779949500386, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1969.4924140003423, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 217.80040674933417, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.35959133329622, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.68737250040067, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.487208000286046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 135.78960418614363, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.248569927175273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 141.47805946059194, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.295491081985142, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:49:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6403.514041999983, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3673.7472699996943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.89103688666287, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.2571111866861, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71.60771150000755, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.48925600029179, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.45230292008293, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.44395916409429, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.14832892103383, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.86383695044318, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:22:31 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1967.7691030005917, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68155.64958150026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.52202370666905, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30194.675852978682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.060174000293046, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 28690.57850199988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.238786017536206, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 190.7608596009059, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.315820831757966, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 201.7939926306953, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:30:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79659.19065699996, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6151.039890499987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59643.81296608933, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.85566856666021, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72820.22716649999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.88881099969512, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.75460141925505, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.84323044008759, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.82052626066309, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.29530609472412, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:09:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3681.1479669995606, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2053.6112204995334, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.8134585365939, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.4595237199804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.1684270001133, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.40970950022893, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.65994576060037, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.703956566091634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.927389066522032, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.811263026890341, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:26:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6075.782038499938, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18710.82966599988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.86152282999653, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1399.1507489806645, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.82327049992455, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 249.77231499997288, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.00520032747418, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.22021375171903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.601370668698856, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.86078924680373, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:00:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1854.4511349996355, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1946.9225399998322, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.31250808664421, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.67453586000799, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.06378100002985, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.83936699997139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.87062517806835, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.44668294658585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.122588032508888, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.679400031987392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:36:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6109.639319000053, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80104.1921975002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.6489557500087, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60030.959854124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.53572450000502, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72977.70491000006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.853244697887696, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.59751619510877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.62685607274077, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.67641990006418, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:28:59 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 258289.11931399966, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6391.792147499984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243556.90976791063, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.53380199333787, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245145.88887850026, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.93591650001463, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.76708780687586, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.42738148463032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.04582460958845, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.14441306690115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:24:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7979.495186500003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7917.227143500099, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.29165735467421, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.27479189600157, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.5408804999406, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.1375730000227, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.75648220407465, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.54781930375723, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.42222664757649, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.22668530605768, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:35:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2061.239989999649, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5409.386085500046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.57219613999648, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 184.59242691737745, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.54033799982426, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 177.04604800019297, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.749096802673108, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.6985874906362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.85373386428935, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.919782852467804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 03:56:27 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69805.44086600002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2628.640983999958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31145.775594269337, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.39504159467955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30031.524023999737, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.09308050022446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.17454173585992, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.776976989626807, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.79648379991565, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.794335007256045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 04:45:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63371.482394500046, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15499.0497900003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40907.49880209133, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 208.6239780786691, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40442.42481850006, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 167.15830099974482, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.60641739782716, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.87639899359934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.15978282274332, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.46732669284876, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 02:44:20 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -86724,668 +86724,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-31T14:44:40Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717180644086, + "date": 1717226724529, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2483.930193499873, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13433.544474500195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.35805675598749, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 209.0220402253411, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.19903650019478, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.21709350004676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.4518560301072, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.42860481970348, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.64258402208881, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.95830541999374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:57:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64156.925899000045, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6026.254502000256, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41265.56946777867, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.55780948666447, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40700.54304000007, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.32719400016686, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.41443055996379, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.456075298873564, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.44261223264846, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.16743588026698, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:58:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6044.974260001254, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57687.95476100058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.41275711870792, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24589.487326492675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.05784200073685, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23176.755627000603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.7597112437143, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.31298034386248, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.7405560297403, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.1832146050638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:17:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5139.528454000356, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56125.272259500096, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.85536933997113, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34504.20830972532, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.92951749937492, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34041.64477699999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.51518399288244, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.96340996331081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.096406452598835, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.10117683706032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:10:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6079.625620499883, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2597.9437940004573, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.71143561999966, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.2349371866864, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.30147799994847, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.46986550031943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.055339955676246, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.01261998820881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.657973738382104, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.105187230427656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:15:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17561.944785500145, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2386.4957324999523, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 227.8076672553434, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.10263050399833, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 187.1709194997493, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.63464149947686, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.49130965311733, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.781622552837995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 149.549457722834, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15.980207944737126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:30:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6065.366650499982, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 249056.0881115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.55514536999408, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 234769.65152463395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.8598694999746, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 235907.7511035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.68262714845876, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.73109339173806, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.64022278004714, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.4586457380798, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:43:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1855.0040115001138, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6099.945930500098, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.68821522665712, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.01067297334339, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.1674144998251, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.5510320001631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.802702506272544, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.51520349307166, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.06161011869142, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.0217535809248, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:51:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 257382.11322200004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6369.022003499936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242495.9619792253, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.86370787999249, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244313.38900100012, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.98295800001597, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.75110738836511, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.19853741333624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.94275516991033, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.854873681434434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:38:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1969.4924140003423, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3643.2665580000503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.35959133329622, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 141.6193678199003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.487208000286046, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.86853350022284, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.248569927175273, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.360144188083904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.295491081985142, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.628681514108088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:45:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3673.7472699996943, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5795.081401499374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.2571111866861, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.1639660506286, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.48925600029179, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.6784984991682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.44395916409429, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.65340047924928, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.86383695044318, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 45.83244374329012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:40:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68155.64958150026, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1941.5869834997466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30194.675852978682, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.86718834666681, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 28690.57850199988, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.243063499725395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 190.7608596009059, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.086835355884213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 201.7939926306953, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.167377682341602, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:59:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6151.039890499987, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13001.913375500408, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.85566856666021, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 164.54864900934749, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.88881099969512, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.40938450053363, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.84323044008759, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.63882724243625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.29530609472412, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.19328698760164, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:08:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2053.6112204995334, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5079.459464000138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.4595237199804, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.60726572002507, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.40970950022893, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.30312999964372, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.703956566091634, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.200136683882086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.811263026890341, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.799919584297072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18710.82966599988, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1915.7010099993386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1399.1507489806645, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.33896263003044, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 249.77231499997288, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.913551500154426, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.22021375171903, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.154389783356644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.86078924680373, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.435429443586605, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 16:24:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1946.9225399998322, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7518.360590500038, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.67453586000799, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.8976876106687, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.83936699997139, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.7351630000303, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.44668294658585, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.1581977850762, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.679400031987392, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.68204937520141, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:17:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80104.1921975002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5995.356360000073, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60030.959854124, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.52373509334674, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72977.70491000006, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.87627850002855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.59751619510877, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.286603577508885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.67641990006418, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.21856208176765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 14:23:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6391.792147499984, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5242.797461000009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.53380199333787, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.47744490133482, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.93591650001463, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.1728239997683, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.42738148463032, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.08456078704941, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.14441306690115, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.247383541717056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:36:41 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7917.227143500099, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1849.2760655003622, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.27479189600157, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.93386266001713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.1375730000227, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.14978950027216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.54781930375723, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.566374270951178, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.22668530605768, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.79882193081735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 13:49:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5409.386085500046, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 74319.16772249997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 184.59242691737745, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54905.67989840266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 177.04604800019297, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67115.6059944999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.6985874906362, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.20423739689029, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.919782852467804, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.20086864799721, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-05-31 15:49:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2628.640983999958, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11776.748068499728, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.39504159467955, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.14932382466395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.09308050022446, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 151.4051315002689, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.776976989626807, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.38786759309662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.794335007256045, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.01644828327463, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:23:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15499.0497900003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2038.1226959998457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 208.6239780786691, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.25553335331158, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 167.15830099974482, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.933951999752026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 124.87639899359934, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.60494243109782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.46732669284876, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.695432862760343, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-05-31 15:04:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -87406,668 +87406,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-31T14:44:40Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717226724529, + "date": 1717226809492, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13433.544474500195, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5465.104090500063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 209.0220402253411, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.38181604533747, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.21709350004676, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.48556349932187, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.42860481970348, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.01814435216436, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.95830541999374, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.23224205729735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:16:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6026.254502000256, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64389.11264650028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.55780948666447, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41904.01268081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.32719400016686, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41665.827635500136, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.456075298873564, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.50732660572248, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.16743588026698, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.06239928118974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57687.95476100058, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1968.1510505001825, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24589.487326492675, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.35354207994776, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23176.755627000603, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.13816049994057, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.31298034386248, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.212979792208433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.1832146050638, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.30842540938585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56125.272259500096, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6409.06140650003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34504.20830972532, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.28681744666224, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34041.64477699999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.82857050002167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.96340996331081, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.45402963019084, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.10117683706032, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.183773724756755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:34 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2597.9437940004573, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5139.758131000235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.2349371866864, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.55492249328138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.46986550031943, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.3277339999695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.01261998820881, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.51161150505591, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.105187230427656, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.122926756589106, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:09:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2386.4957324999523, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1952.253523500076, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.10263050399833, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.06344014667896, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.63464149947686, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.132333999921684, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.781622552837995, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.518420134876393, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15.980207944737126, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.81889525331887, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 249056.0881115, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15772.96817550041, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 234769.65152463395, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 212.41135819334036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 235907.7511035, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 168.6653629999455, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.73109339173806, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.16649284113906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.4586457380798, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.16589601969588, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:25:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6099.945930500098, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7947.8744014999165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.01067297334339, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.44730231334324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.5510320001631, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 135.29116350002823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.51520349307166, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.84342651133543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.0217535809248, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.437561346413524, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:26 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6369.022003499936, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3682.9004324999914, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.86370787999249, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.17348532340233, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.98295800001597, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.40964100010751, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.19853741333624, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.450300879533238, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.854873681434434, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.877735847614165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:24:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3643.2665580000503, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6091.999458499913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 141.6193678199003, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.25372375333382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.86853350022284, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.15678899995783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.360144188083904, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.74735498837957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.628681514108088, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.65306078308095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:29:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5795.081401499374, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80321.45212449995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.1639660506286, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60084.608386216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.6784984991682, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73040.18949800024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 52.65340047924928, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.53753827580553, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 45.83244374329012, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.11969253932963, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1941.5869834997466, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2639.5517854998616, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.86718834666681, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.02935196665324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.243063499725395, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.91525050001292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.086835355884213, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.807965012021203, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.167377682341602, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.892292434902217, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13001.913375500408, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2058.8162895001005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 164.54864900934749, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.24307422668062, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.40938450053363, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.71235049993993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.63882724243625, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.78980069313554, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.19328698760164, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.875658441741487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5079.459464000138, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6143.030510499784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.60726572002507, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.48958786333544, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.30312999964372, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.64175400015483, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.200136683882086, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.00522969330008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.799919584297072, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.680604117973, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1915.7010099993386, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17717.51661799999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.33896263003044, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 229.6791376726675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.913551500154426, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 184.0309030003482, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.154389783356644, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.70600372650583, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.435429443586605, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 151.8096421807653, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:03:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7518.360590500038, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19371.194835000097, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.8976876106687, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1825.0263110479912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.7351630000303, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 296.2281604995951, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.1581977850762, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.1624820135071, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.68204937520141, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.04514007011437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5995.356360000073, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68792.75241000051, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.52373509334674, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30492.831526224, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.87627850002855, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 29110.252202000083, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.286603577508885, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.33320536899203, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.21856208176765, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.79778100221915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:33 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5242.797461000009, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6151.4509609999095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.47744490133482, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.6259924333493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.1728239997683, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.0594135000756, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.08456078704941, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.80231963248332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.247383541717056, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.29096479789972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1849.2760655003622, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 256516.0198184999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.93386266001713, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242469.63911015532, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.14978950027216, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242928.41760100055, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.566374270951178, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.73875155506106, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.79882193081735, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.58589442037847, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 74319.16772249997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2521.811208499912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54905.67989840266, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.12486600530733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67115.6059944999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.18184749966895, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.20423739689029, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.673919768941687, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.20086864799721, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.760043884738042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:10:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11776.748068499728, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1856.4668409999285, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.14932382466395, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.92989755330139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 151.4051315002689, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.78311549956561, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.38786759309662, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.772554630902334, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.01644828327463, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.100548923487823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:50:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2038.1226959998457, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6128.83922400124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.25553335331158, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.032034792023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.933951999752026, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.85366000019712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.60494243109782, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.9551760090202, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.695432862760343, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.91666134291712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:57:35 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -88088,668 +88088,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-31T14:44:40Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717226809492, + "date": 1717226907541, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5465.104090500063, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2638.090289000047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.38181604533747, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.13079550531378, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.48556349932187, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.69665100023849, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.01814435216436, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.85371453968665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.23224205729735, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.748642085078284, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:36:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64389.11264650028, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17609.569553000256, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41904.01268081, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 231.22858862998328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41665.827635500136, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 186.24312999963877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.50732660572248, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.26747173435956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.06239928118974, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 151.15857947976536, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:44:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1968.1510505001825, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6160.346129499885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.35354207994776, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.34277093999967, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.13816049994057, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.88684999992256, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.212979792208433, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.80396775220805, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.30842540938585, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.31792395837699, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6409.06140650003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6098.687131000588, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.28681744666224, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.37724877200769, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.82857050002167, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.9813774994982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.45402963019084, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.648997186359274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.183773724756755, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.64992248008313, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:22:19 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5139.758131000235, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69584.89189899956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.55492249328138, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31048.29460035534, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.3277339999695, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30093.43344050012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.51161150505591, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.66771929707056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.122926756589106, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.56901990804454, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:57:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1952.253523500076, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2483.630244500091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.06344014667896, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.69600437866029, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.132333999921684, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.64083249989562, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.518420134876393, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.522834716305688, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.81889525331887, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.63574917771536, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15772.96817550041, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7919.601063499954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 212.41135819334036, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.3910337440011, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 168.6653629999455, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.13522700000613, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.16649284113906, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.9569065085812, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.16589601969588, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.51430730106501, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7947.8744014999165, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1949.4820319996506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.44730231334324, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.94478731334432, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 135.29116350002823, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.578827499943145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.84342651133543, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.49369730497954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.437561346413524, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.834836465259444, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:35:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3682.9004324999914, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5395.93860450077, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.17348532340233, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.97643779730666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.40964100010751, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.70760699971288, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.450300879533238, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.696208352486806, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.877735847614165, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.85903324170393, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:27:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6091.999458499913, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1958.2405209998797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.25372375333382, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.26579094668462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.15678899995783, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.981102999514405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.74735498837957, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.196314056028056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.65306078308095, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.308551286685358, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:28:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80321.45212449995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19417.44170650054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60084.608386216, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1984.1171261193424, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73040.18949800024, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 299.54049000025407, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.53753827580553, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.0769743161095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.11969253932963, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.35185909605494, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2639.5517854998616, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64378.06861699982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.02935196665324, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41894.84862783465, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.91525050001292, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41211.34732250005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.807965012021203, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.44607479219636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.892292434902217, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.86384090894646, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2058.8162895001005, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6099.562576000097, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.24307422668062, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.94176616332773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.71235049993993, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.25698550010475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.78980069313554, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.78026131030745, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.875658441741487, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.56709518774522, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:12 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6143.030510499784, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2051.8238935001136, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.48958786333544, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.6770349466739, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.64175400015483, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42.637954500150954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.00522969330008, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.685178672366225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.680604117973, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.807880624142099, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17717.51661799999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1862.6115929996558, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 229.6791376726675, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.27098721333701, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 184.0309030003482, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.117498499745125, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.70600372650583, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.844234496377625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 151.8096421807653, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.151565290528257, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19371.194835000097, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6394.527255500008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1825.0263110479912, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.05473383999576, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 296.2281604995951, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.38765350000631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.1624820135071, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.40613075942994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.04514007011437, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.10810124192214, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68792.75241000051, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6068.195037999885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30492.831526224, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.8231543266902, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 29110.252202000083, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.73207999993792, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.33320536899203, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.951069390166126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.79778100221915, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.54447877075444, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:46:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6151.4509609999095, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15614.073387499957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.6259924333493, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 215.67482790600783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.0594135000756, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.90871850036638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.80231963248332, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.92911536352358, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.29096479789972, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.3077546218594, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 256516.0198184999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5134.5485170004395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242469.63911015532, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.24353658669618, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242928.41760100055, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.83863849916816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.73875155506106, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.537869481502305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.58589442037847, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.134182925720367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2521.811208499912, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78420.08978000013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.12486600530733, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58702.017414699985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.18184749966895, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72048.7134335001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.673919768941687, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.3635887082443, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.760043884738042, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.80326064012668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1856.4668409999285, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 257630.41754999995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.92989755330139, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244192.06504996668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.78311549956561, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244848.77846500013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.772554630902334, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.32903447195388, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.100548923487823, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.0383085146902, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6128.83922400124, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3678.4869454995714, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.032034792023, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.53516155332423, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.85366000019712, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.8544999989972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.9551760090202, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.619680555833128, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.91666134291712, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.842106511884577, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:04:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -88770,668 +88770,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-31T14:44:40Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717226907541, + "date": 1717226920680, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2638.090289000047, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5153.267233499719, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 124.13079550531378, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.72724149327648, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.69665100023849, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.75649099886505, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.85371453968665, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.56325137920866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.748642085078284, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.135406988289148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:11:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17609.569553000256, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6411.88908099997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 231.22858862998328, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.83134152666541, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 186.24312999963877, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.34774449998804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.26747173435956, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.46555021775157, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 151.15857947976536, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.16765750887564, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:18:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6160.346129499885, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5465.795638499003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.34277093999967, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.45962854265963, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.88684999992256, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.95940000055998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.80396775220805, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.97504323687587, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.31792395837699, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.226725419496674, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:56:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6098.687131000588, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80529.45460850015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.37724877200769, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60326.79254720133, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.9813774994982, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73896.06737299982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.648997186359274, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.78470235412063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.64992248008313, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.91019339517356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:07:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69584.89189899956, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6095.397781999964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31048.29460035534, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.82052522332332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30093.43344050012, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.79779149999922, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.66771929707056, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.91476485872919, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.56901990804454, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.6967848006478, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2483.630244500091, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259042.47589599958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.69600437866029, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244226.45942004398, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.64083249989562, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245914.3064604996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.522834716305688, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.77792115304503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.63574917771536, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.17802914457835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:45:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7919.601063499954, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1956.863275499927, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.3910337440011, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.70074537330827, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.13522700000613, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.54534800050169, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.9569065085812, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.222250544463073, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.51430730106501, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.288621271329562, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:38:27 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1949.4820319996506, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3676.1369844998626, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.94478731334432, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.38824281663983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.578827499943145, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.10873100043682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.49369730497954, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.436373215296747, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.834836465259444, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.844668904210636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:05:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5395.93860450077, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7972.256407499913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.97643779730666, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.6752595119912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.70760699971288, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.30707749999237, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.696208352486806, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.938903093863736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.85903324170393, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.552133147019305, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:39:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1958.2405209998797, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70102.49670000031, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.26579094668462, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31426.950578070013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.981102999514405, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30875.1951510003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.196314056028056, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.71480064987958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.308551286685358, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.45817038187403, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:33:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19417.44170650054, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6055.798128000788, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1984.1171261193424, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.03516805731974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 299.54049000025407, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.93307399992045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.0769743161095, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.60481309716015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.35185909605494, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.55947662492116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:15:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64378.06861699982, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2646.620501999678, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41894.84862783465, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.66360963468833, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41211.34732250005, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.93796700006351, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.44607479219636, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.870622147981955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.86384090894646, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.90668610635907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:47:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6099.562576000097, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6173.267183000235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.94176616332773, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.52148489999067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.25698550010475, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.1482324996523, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.78026131030745, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.879281108834356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.56709518774522, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.40217023112008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:31:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2051.8238935001136, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1963.2678400007535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.6770349466739, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.80734718000774, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42.637954500150954, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.49763450007595, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.685178672366225, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.528203331464441, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.807880624142099, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.879467540563104, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:55 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1862.6115929996558, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15955.817038000077, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.27098721333701, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 210.95193908932862, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.117498499745125, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 168.29410699983782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.844234496377625, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.05553459825703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.151565290528257, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.93621821245296, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:39:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6394.527255500008, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6079.774405500075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.05473383999576, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.21347402667318, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.38765350000631, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.48538349987211, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.40613075942994, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.91949216651743, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.10810124192214, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.54900252614391, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:25:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6068.195037999885, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65760.63130699993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.8231543266902, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42493.21184977733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.73207999993792, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42642.46320699988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.951069390166126, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.45283879670744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.54447877075444, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.5090991386381, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:03:21 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15614.073387499957, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19413.51446599947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 215.67482790600783, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1911.2615646006827, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.90871850036638, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 301.560390999839, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.92911536352358, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 160.18905535696663, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.3077546218594, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.03388925635815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:52:10 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5134.5485170004395, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2055.1594009998553, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.24353658669618, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.2215669666851, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.83863849916816, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.70422399965901, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.537869481502305, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.804952142759143, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.134182925720367, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.80095409422303, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:00:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78420.08978000013, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2505.315351000263, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58702.017414699985, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.81682827600282, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72048.7134335001, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.92226000033043, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.3635887082443, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.642578608419566, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.80326064012668, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.85448758325757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 257630.41754999995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1873.8067115000376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244192.06504996668, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.4497275599612, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244848.77846500013, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.6132789999283, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.32903447195388, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.815606907763367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.0383085146902, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.124262533905293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3678.4869454995714, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18127.93551750019, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.53516155332423, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 227.99471685199507, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.8544999989972, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.33619399961754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.619680555833128, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 149.28410064460172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.842106511884577, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 153.7595891936329, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:31:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -89452,668 +89452,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-31T14:44:40Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717226920680, + "date": 1717399193605, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5153.267233499719, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2055.8448110000427, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.72724149327648, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.56332087997241, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.75649099886505, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.486727499934204, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.56325137920866, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.734279113870905, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.135406988289148, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.832082933123225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:59:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6411.88908099997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 255519.67455000023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.83134152666541, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 241553.6490950207, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.34774449998804, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242416.74192250002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.46555021775157, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.40638422233403, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.16765750887564, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.7109178209585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:23:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5465.795638499003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6064.189869000074, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.45962854265963, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.64453962000455, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.95940000055998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.92926149986124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.97504323687587, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.951733724818304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.226725419496674, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.57946328211994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:38:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80529.45460850015, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3678.7117545, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60326.79254720133, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.70120577671452, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73896.06737299982, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.0109705006762, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.78470235412063, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.657908070816337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.91019339517356, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.014463232733515, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:11:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6095.397781999964, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18655.240721000155, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.82052522332332, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1362.2770440846543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.79779149999922, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 262.8703115005919, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.91476485872919, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.27636172303048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.6967848006478, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.06888759254596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:30:18 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259042.47589599958, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2627.0912965001116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244226.45942004398, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.82107498667513, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245914.3064604996, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.28235999983735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.77792115304503, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.810647724409144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.17802914457835, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.80232702724121, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:26:14 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1956.863275499927, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6131.326911000315, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.70074537330827, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.16482551332592, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.54534800050169, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.90202300003511, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.222250544463073, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.78329706453252, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.288621271329562, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.25952119505826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:32:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3676.1369844998626, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69814.66886199996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.38824281663983, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31225.528803002006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.10873100043682, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30391.95524150091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.436373215296747, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.78759748293524, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.844668904210636, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.26347417634994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:30:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7972.256407499913, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7891.291946000024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.6752595119912, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.65347058000043, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.30707749999237, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.4609829999872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.938903093863736, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.602982839799644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.552133147019305, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.20217650230741, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:37:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70102.49670000031, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6055.366905000483, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31426.950578070013, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.88969452534124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30875.1951510003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.752284000293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.71480064987958, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.74072664358886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.45817038187403, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.74285928966727, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 04:48:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6055.798128000788, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1955.5083455002205, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.03516805731974, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.00293331333948, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.93307399992045, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.8722764995473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.60481309716015, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.454777052584479, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.55947662492116, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.78043380833142, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:06:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2646.620501999678, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15412.548571500338, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.66360963468833, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 210.16727801800093, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.93796700006351, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 163.89254700015954, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.870622147981955, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.69744632406882, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.90668610635907, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.96581216853258, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:10:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6173.267183000235, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16840.66650349996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.52148489999067, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 220.79072606800037, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.1482324996523, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.54757699972834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.879281108834356, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 136.65848210703913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.40217023112008, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.03363637500365, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:55:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1963.2678400007535, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2459.1634369999156, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.80734718000774, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.2581605826866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.49763450007595, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.49792600008732, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.528203331464441, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.410164495557872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.879467540563104, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.566398486916665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:04:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15955.817038000077, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5129.752017000101, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 210.95193908932862, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.16210111339994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 168.29410699983782, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.41876699945715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.05553459825703, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.49815985398641, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.93621821245296, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.062567575039832, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:51:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6079.774405500075, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5430.696962000184, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.21347402667318, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.9749076986797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.48538349987211, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.23178100014047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.91949216651743, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.03292704847647, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.54900252614391, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.00412580783481, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:02:34 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65760.63130699993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6086.761909499955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42493.21184977733, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.48856736667524, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42642.46320699988, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.618281999994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.45283879670744, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.758815015615966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.5090991386381, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.57903657881813, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 02:45:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19413.51446599947, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6387.866823999957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1911.2615646006827, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.88967173333322, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 301.560390999839, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.6485330000437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 160.18905535696663, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.42783907498824, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.03388925635815, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.13229719876569, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-01 05:14:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2055.1594009998553, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78795.04042050007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.2215669666851, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59064.32769362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.70422399965901, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72272.24899349995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.804952142759143, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.32638058805797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.80095409422303, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.77161860426683, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:58:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2505.315351000263, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64624.35653950001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.81682827600282, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41730.368778000004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.92226000033043, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41610.93636999999, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.642578608419566, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.42305277026975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.85448758325757, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.35526501744602, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:44:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1873.8067115000376, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1959.06081000021, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.4497275599612, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.16835068669025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.6132789999283, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.641754999593104, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.815606907763367, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.225090784758686, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.124262533905293, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.26305151808329, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 03:38:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18127.93551750019, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1855.636292500094, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 227.99471685199507, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.84331502001562, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.33619399961754, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.36902550022205, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 149.28410064460172, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.833273623129934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 153.7595891936329, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.143455619047463, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-01 04:17:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -90134,668 +90134,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-31T14:44:40Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717399193605, + "date": 1717399231380, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2055.8448110000427, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64884.99480949997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.56332087997241, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42128.04296355933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.486727499934204, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41348.1672575, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.734279113870905, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.04445164856457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.832082933123225, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.1711670945071, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 255519.67455000023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1872.9667494999376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 241553.6490950207, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.80009647333888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242416.74192250002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.514852499840345, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.40638422233403, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.896918944598946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.7109178209585, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.185281566078396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:18:51 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6064.189869000074, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16066.287050499795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.64453962000455, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 214.53785313599292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.92926149986124, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.64887800001816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.951733724818304, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.22571279814014, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.57946328211994, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 136.22333940196228, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:55:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3678.7117545, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2044.9386479999703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.70120577671452, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.02566830661453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.0109705006762, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.38501999982691, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.657908070816337, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.696571175341772, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.014463232733515, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.787949541710848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18655.240721000155, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3678.8398609996875, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1362.2770440846543, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.34025776999928, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 262.8703115005919, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.8414980005291, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.27636172303048, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.572533695436558, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.06888759254596, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.823086931210003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:05:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2627.0912965001116, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19371.059033500387, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.82107498667513, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1885.296960885332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.28235999983735, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 294.679543500024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.810647724409144, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.6807658872163, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.80232702724121, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.29431613646037, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6131.326911000315, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78392.3951145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.16482551332592, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58729.79366595998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.90202300003511, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71717.63896000016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.78329706453252, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.47657554120445, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.25952119505826, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.3877575028016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:48:13 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69814.66886199996, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17474.08793500017, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31225.528803002006, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 228.71194765666405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30391.95524150091, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.9115324996783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.78759748293524, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.1916789198398, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.26347417634994, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 151.33294772111572, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7891.291946000024, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6135.173119999763, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.65347058000043, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.64072813999398, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.4609829999872, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.32800649999626, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.602982839799644, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.79748169149696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.20217650230741, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.28670736210095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:30:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6055.366905000483, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5498.607529500077, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.88969452534124, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.54350937465642, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.752284000293, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.12448450006923, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.74072664358886, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.94940404538018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.74285928966727, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.06768571114641, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1955.5083455002205, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5138.209700498919, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.00293331333948, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.15705355996519, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.8722764995473, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.36117849939183, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.454777052584479, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.58822536454887, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.78043380833142, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.174410694362553, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15412.548571500338, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1954.9747514997762, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 210.16727801800093, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.62923728671558, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 163.89254700015954, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.56203150001238, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 124.69744632406882, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.507764798042727, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.96581216853258, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.798509336531822, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16840.66650349996, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6139.169226500144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 220.79072606800037, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.7274202033368, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.54757699972834, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.7162074999469, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 136.65848210703913, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.97776774062294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.03363637500365, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.54337463571262, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:10:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2459.1634369999156, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2492.9172754996216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.2581605826866, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.6331017546569, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.49792600008732, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.75818999961848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.410164495557872, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.530413646906446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.566398486916665, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.677904326030006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5129.752017000101, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6414.464983000016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.16210111339994, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 112.29079833333799, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.41876699945715, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72.604216000002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.49815985398641, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.471566068882154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.062567575039832, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.185656056709675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:50:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5430.696962000184, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1966.2476574999346, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.9749076986797, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.28211399331971, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.23178100014047, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.71289049968618, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.03292704847647, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.294746986263503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.00412580783481, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.34823025852795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6086.761909499955, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6110.828845000469, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.48856736667524, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.98278304797957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.618281999994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.8971290002446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.758815015615966, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.213019199453505, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.57903657881813, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.04679258427369, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:23:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6387.866823999957, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 256072.86408100027, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.88967173333322, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242264.19104754063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.6485330000437, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243567.29364700004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.42783907498824, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71.00752805688872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.13229719876569, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.89142780360217, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:00 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78795.04042050007, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6088.1548394999645, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59064.32769362, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.75784957333812, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72272.24899349995, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.68618349997814, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.32638058805797, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.65933069680194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.77161860426683, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.60653004686339, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:03:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64624.35653950001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2650.027239999872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41730.368778000004, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.68155883867075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41610.93636999999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.72223100009069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.42305277026975, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 20.00516919578365, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.35526501744602, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.887225188275885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:38:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1959.06081000021, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71232.04859200087, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.16835068669025, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32204.601403142005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.641754999593104, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31570.015363499806, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.225090784758686, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.88114407327933, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.26305151808329, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 204.25227204735702, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1855.636292500094, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7951.652094499991, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.84331502001562, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.5392373279974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.36902550022205, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.66923750009119, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.833273623129934, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.64349126188311, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.143455619047463, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.282586737329886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -90816,2032 +90816,2032 @@ window.BENCHMARK_DATA = { "timestamp": "2024-05-31T14:44:40Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" }, - "date": 1717399231380, + "date": 1717399352850, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64884.99480949997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65653.67689900006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42128.04296355933, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42922.725588200665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41348.1672575, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42452.05514099996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.04445164856457, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.64114655299583, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.1711670945071, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.02451028525023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1872.9667494999376, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18028.672680499767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.80009647333888, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 228.00184653931015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.514852499840345, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 184.5591454998612, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.896918944598946, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 149.30636013642217, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.185281566078396, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.20055413725726, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16066.287050499795, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16051.454757500323, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 214.53785313599292, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 212.19625483396766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.64887800001816, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 165.96760799984622, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.22571279814014, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.73121629907203, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 136.22333940196228, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 135.53129105893592, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2044.9386479999703, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1950.291978500445, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.02566830661453, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.71114405334265, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.38501999982691, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.57355600028313, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.696571175341772, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.472545235675167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.787949541710848, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.795277081505917, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:49 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3678.8398609996875, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5484.436976999859, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.34025776999928, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.11239930000252, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.8414980005291, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.24708299949998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.572533695436558, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.14093455176976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.823086931210003, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.30365630337018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19371.059033500387, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2635.7566330002555, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1885.296960885332, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.5081048280105, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 294.679543500024, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.22319499954756, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.6807658872163, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.82901910338906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.29431613646037, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.959692317755575, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:06:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78392.3951145, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6082.61340899935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58729.79366595998, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.66617888800829, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71717.63896000016, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.32013650014414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.47657554120445, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.92637214879297, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.3877575028016, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.96357205752251, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17474.08793500017, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259887.42452949987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 228.71194765666405, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244297.01181745736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.9115324996783, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246612.01731449977, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.1916789198398, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.85346358159815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 151.33294772111572, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.03277572366092, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6135.173119999763, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2059.9316304997046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.64072813999398, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.80629287329187, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.32800649999626, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.89324199983457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.79748169149696, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.75254192367904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.28670736210095, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.818274649825822, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5498.607529500077, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6147.821475499768, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.54350937465642, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.80471219998556, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.12448450006923, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.50420700013638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.94940404538018, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.79804842406723, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.06768571114641, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.293424371762335, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:30:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5138.209700498919, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5145.081876000404, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.15705355996519, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.81483949998074, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.36117849939183, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.30737250068341, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.58822536454887, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.537024166646766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.174410694362553, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.121061388982348, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:51:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1954.9747514997762, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2503.5247364999123, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.62923728671558, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.9130924946609, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.56203150001238, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.03955550011233, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.507764798042727, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.600041352456742, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.798509336531822, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.867856167450782, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6139.169226500144, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1855.5622699996093, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.7274202033368, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.49166844330588, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.7162074999469, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.94448049998391, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.97776774062294, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.805301836104457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.54337463571262, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.068215507272877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2492.9172754996216, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19543.508186999134, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 117.6331017546569, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1963.6742602433683, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.75818999961848, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 309.27946499923564, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.530413646906446, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.62565076317185, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.677904326030006, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.1995780301561, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6414.464983000016, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3680.0972384999113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 112.29079833333799, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.19777263336314, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72.604216000002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.91298149926297, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.471566068882154, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.69263322334179, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.185656056709675, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.913538028351816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1966.2476574999346, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6405.955829499988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.28211399331971, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.1068306533328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.71289049968618, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.63821700006656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.294746986263503, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.43517654688617, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.34823025852795, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.149975453829946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6110.828845000469, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1969.1396335001627, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.98278304797957, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.93283122000018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.8971290002446, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.59165750009197, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.213019199453505, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.23605530753715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 48.04679258427369, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.304046444367396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:58:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 256072.86408100027, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70070.83320249966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242264.19104754063, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31352.185162739992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243567.29364700004, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30375.764695000726, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71.00752805688872, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 193.05356530653393, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.89142780360217, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.46995719542804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6088.1548394999645, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6088.025100000095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.75784957333812, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.1539829, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.68618349997814, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.05334449990187, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.65933069680194, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.04965201487477, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.60653004686339, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.64856071417043, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2650.027239999872, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7964.07807549997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.68155883867075, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.12438900266352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.72223100009069, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.33693000006497, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 20.00516919578365, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.697434615094835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.887225188275885, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.35603009244823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71232.04859200087, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6083.727579999959, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 32204.601403142005, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.27163534000054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31570.015363499806, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.52642900008323, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.88114407327933, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.777392715063506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 204.25227204735702, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.60346729963552, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:40:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7951.652094499991, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78119.36454800003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.5392373279974, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58545.294935608, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.66923750009119, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71478.12105899994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.64349126188311, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.41038133516298, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.282586737329886, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.31888471951564, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Domenic Barbuzzi", - "username": "dbarbuzzi", - "email": "dbarbuzzi@gmail.com" + "name": "Robert Shaw", + "username": "robertgshaw2-neuralmagic", + "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "019467500eee9542833aedd6e2c17bd62ffa4b40", - "message": "Handle server startup failure in __enter__ (#274)\n\nWith a context manager class, the `__exit__` method is not called when\r\nan exception is raised during the context manager’s `__enter__` method.\r\nThis PR addresses that by manually calling that method if an exception\r\nis raised.", - "timestamp": "2024-05-31T14:44:40Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/019467500eee9542833aedd6e2c17bd62ffa4b40" + "id": "fec35636296a2dfad0880699e061918f64a3b9d6", + "message": "Upstream sync 2024 05 19 (#249)\n\nUpstream sync 2024 05 25 (#249)\r\n\r\nSUMMARY:\r\nMerge commits from\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nto\r\nhttps://github.com/vllm-project/vllm/commit/f68470e803df575f294e67167b4b83adfe004cfa\r\n\r\nNote that\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nis NOT included in this merge.\r\n\r\n---\r\n\r\n
\r\n\r\n PR Checklist (Click to Expand) \r\n\r\n

Thank you for your contribution to vLLM! Before submitting the pull\r\nrequest, please ensure the PR meets the following criteria. This helps\r\nvLLM maintain the code quality and improve the efficiency of the review\r\nprocess.

\r\n\r\n

PR Title and Classification

\r\n

Only specific types of PRs will be reviewed. The PR title is prefixed\r\nappropriately to indicate the type of change. Please use one of the\r\nfollowing:

\r\n
    \r\n
  • [Bugfix] for bug fixes.
  • \r\n
  • [CI/Build] for build or continuous integration\r\nimprovements.
  • \r\n
  • [Doc] for documentation fixes and improvements.
  • \r\n
  • [Model] for adding a new model or improving an existing\r\nmodel. Model name should appear in the title.
  • \r\n
  • [Frontend] For changes on the vLLM frontend (e.g.,\r\nOpenAI API server, LLM class, etc.)
  • \r\n
  • [Kernel] for changes affecting CUDA kernels or other\r\ncompute kernels.
  • \r\n
  • [Core] for changes in the core vLLM logic (e.g.,\r\nLLMEngine, AsyncLLMEngine,\r\nScheduler, etc.)
  • \r\n
  • [Hardware][Vendor] for hardware-specific changes.\r\nVendor name should appear in the prefix (e.g.,\r\n[Hardware][AMD]).
  • \r\n
  • [Misc] for PRs that do not fit the above categories.\r\nPlease use this sparingly.
  • \r\n
\r\n

Note: If the PR spans more than one category, please\r\ninclude all relevant prefixes.

\r\n\r\n

Code Quality

\r\n\r\n

The PR need to meet the following code quality standards:

\r\n\r\n
    \r\n
  • We adhere to Google Python\r\nstyle guide and Google C++\r\nstyle guide.
  • \r\n
  • Pass all linter checks. Please use format.sh\r\nto format your code.
  • \r\n
  • The code need to be well-documented to ensure future contributors\r\ncan easily understand the code.
  • \r\n
  • Include sufficient tests to ensure the project to stay correct and\r\nrobust. This includes both unit tests and integration tests.
  • \r\n
  • Please add documentation to docs/source/ if the PR\r\nmodifies the user-facing behaviors of vLLM. It helps vLLM user\r\nunderstand and utilize the new features or changes.
  • \r\n
\r\n\r\n

Notes for Large Changes

\r\n

Please keep the changes as concise as possible. For major\r\narchitectural changes (>500 LOC excluding kernel/data/config/test), we\r\nwould expect a GitHub issue (RFC) discussing the technical design and\r\njustification. Otherwise, we will tag it with rfc-required\r\nand might not go through the PR.

\r\n\r\n

What to Expect for the Reviews

\r\n\r\n

The goal of the vLLM team is to be a transparent reviewing\r\nmachine. We would like to make the review process transparent and\r\nefficient and make sure no contributor feel confused or frustrated.\r\nHowever, the vLLM team is small, so we need to prioritize some PRs over\r\nothers. Here is what you can expect from the review process:

\r\n\r\n
    \r\n
  • After the PR is submitted, the PR will be assigned to a reviewer.\r\nEvery reviewer will pick up the PRs based on their expertise and\r\navailability.
  • \r\n
  • After the PR is assigned, the reviewer will provide status update\r\nevery 2-3 days. If the PR is not reviewed within 7 days, please feel\r\nfree to ping the reviewer or the vLLM team.
  • \r\n
  • After the review, the reviewer will put an \r\naction-required label on the PR if there are changes required.\r\nThe contributor should address the comments and ping the reviewer to\r\nre-review the PR.
  • \r\n
  • Please respond to all comments within a reasonable time frame. If a\r\ncomment isn't clear or you disagree with a suggestion, feel free to ask\r\nfor clarification or discuss the suggestion.\r\n
  • \r\n
\r\n\r\n

Thank You

\r\n\r\n

Finally, thank you for taking the time to read these guidelines and\r\nfor your interest in contributing to vLLM. Your contributions make vLLM\r\na great tool for everyone!

\r\n\r\n\r\n
\r\n\r\n---------\r\n\r\nSigned-off-by: kerthcet \r\nCo-authored-by: zhaoyang-star \r\nCo-authored-by: Cyrus Leung \r\nCo-authored-by: Simon Mo \r\nCo-authored-by: Cade Daniel \r\nCo-authored-by: Noam Gat \r\nCo-authored-by: Philipp Moritz \r\nCo-authored-by: youkaichao \r\nCo-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>\r\nCo-authored-by: Austin Veselka <50646302+FurtherAI@users.noreply.github.com>\r\nCo-authored-by: leiwen83 \r\nCo-authored-by: Lei Wen \r\nCo-authored-by: Cody Yu \r\nCo-authored-by: SangBin Cho \r\nCo-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>\r\nCo-authored-by: Woosuk Kwon \r\nCo-authored-by: Antoni Baum \r\nCo-authored-by: alexm-nm <59768536+alexm-nm@users.noreply.github.com>\r\nCo-authored-by: Mahmoud Ashraf \r\nCo-authored-by: Michael Goin \r\nCo-authored-by: kliuae <17350011+kliuae@users.noreply.github.com>\r\nCo-authored-by: miloice \r\nCo-authored-by: Hao Zhang <152229491+sfc-gh-hazhang@users.noreply.github.com>\r\nCo-authored-by: Dash Desai <1723932+iamontheinet@users.noreply.github.com>\r\nCo-authored-by: Aurick Qiao \r\nCo-authored-by: Aurick Qiao \r\nCo-authored-by: Aurick Qiao \r\nCo-authored-by: Allen.Dou \r\nCo-authored-by: Kunshang Ji \r\nCo-authored-by: Steve Grubb \r\nCo-authored-by: heeju-kim2 <157340754+heeju-kim2@users.noreply.github.com>\r\nCo-authored-by: Chang Su \r\nCo-authored-by: Yikang Shen \r\nCo-authored-by: Swapnil Parekh \r\nCo-authored-by: Sanger Steel \r\nCo-authored-by: Stephen Krider <72541272+skrider@users.noreply.github.com>\r\nCo-authored-by: LiuXiaoxuanPKU \r\nCo-authored-by: Zhuohan Li \r\nCo-authored-by: Kuntai Du \r\nCo-authored-by: Nick Hill \r\nCo-authored-by: SAHIL SUNEJA \r\nCo-authored-by: zifeitong \r\nCo-authored-by: Alex Wu \r\nCo-authored-by: Cade Daniel \r\nCo-authored-by: Jinzhen Lin \r\nCo-authored-by: Alex Wu \r\nCo-authored-by: Pierre Dulac \r\nCo-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>\r\nCo-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>\r\nCo-authored-by: Silencio <19430328+Silencioo@users.noreply.github.com>\r\nCo-authored-by: Silencio \r\nCo-authored-by: Tyler Michael Smith \r\nCo-authored-by: Kante Yin \r\nCo-authored-by: bofeng huang \r\nCo-authored-by: eigenLiu <33959526+eigen2017@users.noreply.github.com>\r\nCo-authored-by: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com>\r\nCo-authored-by: Ubuntu \r\nCo-authored-by: Domenic Barbuzzi ", + "timestamp": "2024-06-03T15:22:54Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/fec35636296a2dfad0880699e061918f64a3b9d6" }, - "date": 1717399352850, + "date": 1717436914272, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65653.67689900006, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1826.7876810000416, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42922.725588200665, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.30180574333039, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42452.05514099996, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.09178000006432, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.64114655299583, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.494657948635467, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.02451028525023, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.822835040136072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:39:39 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18028.672680499767, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2577.4123235000843, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 228.00184653931015, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.01311407067139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 184.5591454998612, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.47934300036286, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 149.30636013642217, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.917851642837732, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.20055413725726, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.005374696378382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:11:08 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16051.454757500323, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7477.170930999932, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 212.19625483396766, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.60422404133442, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 165.96760799984622, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.69303699999546, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.73121629907203, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.154255656328566, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 135.53129105893592, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.70570047176915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:45:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1950.291978500445, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5792.872616500063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.71114405334265, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.60485665071124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.57355600028313, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.6595430001762, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.472545235675167, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.57606963795908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.795277081505917, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 45.77184932528473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:58:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5484.436976999859, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6359.727259500005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.11239930000252, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.62061250000306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.24708299949998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71.44836100002294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.14093455176976, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.14834417896245, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.30365630337018, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.866016742486394, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:32:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2635.7566330002555, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2027.7279659999294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.5081048280105, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.131375346716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.22319499954756, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.80655000011757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.82901910338906, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.574741119740576, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.959692317755575, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.62783674047078, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 04:04:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6082.61340899935, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5086.0681250005655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.66617888800829, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.60802515331791, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.32013650014414, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.84449399956793, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.92637214879297, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.191637028120056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.96357205752251, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.83664939242367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:00:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259887.42452949987, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13438.462082000115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244297.01181745736, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 207.69161295667195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246612.01731449977, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 162.68896749988926, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.85346358159815, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.1323195408694, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.03277572366092, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 107.03430961238783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:19:39 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2059.9316304997046, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2410.390771999573, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.80629287329187, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.01512438667001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.89324199983457, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.54676449955878, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.75254192367904, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.631069397802065, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.818274649825822, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.065749860137487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:51:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6147.821475499768, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58115.24761200053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.80471219998556, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24693.190178945297, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.50420700013638, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 22696.127297999737, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.79804842406723, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.07596480055477, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.293424371762335, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.91163983201352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:49:09 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5145.081876000404, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3638.090063499476, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.81483949998074, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 141.89584527663706, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.30737250068341, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.82084300048155, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.537024166646766, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.30408247226253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.121061388982348, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.61374608368908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:53:20 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2503.5247364999123, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1942.1043275001466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.9130924946609, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.1003962000262, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.03955550011233, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.76818800017645, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.600041352456742, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.053056741675453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.867856167450782, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.178593173806847, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:38:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1855.5622699996093, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1919.03614299963, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.49166844330588, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.22951738337119, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.94448049998391, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.753891499440215, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.805301836104457, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.171030743815644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.068215507272877, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.444907479944426, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:32:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19543.508186999134, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6011.056845000212, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1963.6742602433683, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.51665396332828, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 309.27946499923564, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76.40881049997006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.62565076317185, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.447607174651544, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.1995780301561, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.04213773936851, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 05:08:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3680.0972384999113, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6010.300804999929, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.19777263336314, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.94023836000534, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.91298149926297, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.4834220000148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.69263322334179, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.24737687956218, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.913538028351816, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.325393796500656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:24:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6405.955829499988, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5231.110777500362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.1068306533328, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.19965554800243, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.63821700006656, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 169.80007999973168, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.43517654688617, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.8138107661964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.149975453829946, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.10993344656062, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:17:47 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1969.1396335001627, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6102.600778999886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.93283122000018, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.56953884001207, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.59165750009197, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.45767050031282, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.23605530753715, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.50484903749174, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.304046444367396, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.01785497536404, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:26:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70070.83320249966, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13024.152943000445, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31352.185162739992, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 166.09906876733658, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30375.764695000726, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.501374999767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 193.05356530653393, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 116.45557543230827, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.46995719542804, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.93449016074172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 04:42:04 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6088.025100000095, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73612.5902235001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.1539829, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54508.354054376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.05334449990187, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66708.06425700017, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.04965201487477, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.84795469274688, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.64856071417043, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.49598703678805, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:56:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7964.07807549997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57355.77882799987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.12438900266352, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35571.797603253995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.33693000006497, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35350.50049350002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.697434615094835, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.58072383981926, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.35603009244823, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.55851647084337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:31:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6083.727579999959, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 249174.2924559994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.27163534000054, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 235009.30902119138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.52642900008323, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 236050.11426999955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.777392715063506, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.79708311294138, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.60346729963552, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.43778767154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 02:24:15 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78119.36454800003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11550.040795000314, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58545.294935608, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.73528062333148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71478.12105899994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.74351900016336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.41038133516298, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.77569215848233, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.31888471951564, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.93932269039736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 03:04:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Robert Shaw", - "username": "robertgshaw2-neuralmagic", - "email": "114415538+robertgshaw2-neuralmagic@users.noreply.github.com" + "name": "dhuangnm", + "username": "dhuangnm", + "email": "74931910+dhuangnm@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "fec35636296a2dfad0880699e061918f64a3b9d6", - "message": "Upstream sync 2024 05 19 (#249)\n\nUpstream sync 2024 05 25 (#249)\r\n\r\nSUMMARY:\r\nMerge commits from\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nto\r\nhttps://github.com/vllm-project/vllm/commit/f68470e803df575f294e67167b4b83adfe004cfa\r\n\r\nNote that\r\nhttps://github.com/vllm-project/vllm/commit/c7f2cf2b7f67bce5842fedfdba508440fe257375\r\nis NOT included in this merge.\r\n\r\n---\r\n\r\n
\r\n\r\n PR Checklist (Click to Expand) \r\n\r\n

Thank you for your contribution to vLLM! Before submitting the pull\r\nrequest, please ensure the PR meets the following criteria. This helps\r\nvLLM maintain the code quality and improve the efficiency of the review\r\nprocess.

\r\n\r\n

PR Title and Classification

\r\n

Only specific types of PRs will be reviewed. The PR title is prefixed\r\nappropriately to indicate the type of change. Please use one of the\r\nfollowing:

\r\n
    \r\n
  • [Bugfix] for bug fixes.
  • \r\n
  • [CI/Build] for build or continuous integration\r\nimprovements.
  • \r\n
  • [Doc] for documentation fixes and improvements.
  • \r\n
  • [Model] for adding a new model or improving an existing\r\nmodel. Model name should appear in the title.
  • \r\n
  • [Frontend] For changes on the vLLM frontend (e.g.,\r\nOpenAI API server, LLM class, etc.)
  • \r\n
  • [Kernel] for changes affecting CUDA kernels or other\r\ncompute kernels.
  • \r\n
  • [Core] for changes in the core vLLM logic (e.g.,\r\nLLMEngine, AsyncLLMEngine,\r\nScheduler, etc.)
  • \r\n
  • [Hardware][Vendor] for hardware-specific changes.\r\nVendor name should appear in the prefix (e.g.,\r\n[Hardware][AMD]).
  • \r\n
  • [Misc] for PRs that do not fit the above categories.\r\nPlease use this sparingly.
  • \r\n
\r\n

Note: If the PR spans more than one category, please\r\ninclude all relevant prefixes.

\r\n\r\n

Code Quality

\r\n\r\n

The PR need to meet the following code quality standards:

\r\n\r\n
    \r\n
  • We adhere to Google Python\r\nstyle guide and Google C++\r\nstyle guide.
  • \r\n
  • Pass all linter checks. Please use format.sh\r\nto format your code.
  • \r\n
  • The code need to be well-documented to ensure future contributors\r\ncan easily understand the code.
  • \r\n
  • Include sufficient tests to ensure the project to stay correct and\r\nrobust. This includes both unit tests and integration tests.
  • \r\n
  • Please add documentation to docs/source/ if the PR\r\nmodifies the user-facing behaviors of vLLM. It helps vLLM user\r\nunderstand and utilize the new features or changes.
  • \r\n
\r\n\r\n

Notes for Large Changes

\r\n

Please keep the changes as concise as possible. For major\r\narchitectural changes (>500 LOC excluding kernel/data/config/test), we\r\nwould expect a GitHub issue (RFC) discussing the technical design and\r\njustification. Otherwise, we will tag it with rfc-required\r\nand might not go through the PR.

\r\n\r\n

What to Expect for the Reviews

\r\n\r\n

The goal of the vLLM team is to be a transparent reviewing\r\nmachine. We would like to make the review process transparent and\r\nefficient and make sure no contributor feel confused or frustrated.\r\nHowever, the vLLM team is small, so we need to prioritize some PRs over\r\nothers. Here is what you can expect from the review process:

\r\n\r\n
    \r\n
  • After the PR is submitted, the PR will be assigned to a reviewer.\r\nEvery reviewer will pick up the PRs based on their expertise and\r\navailability.
  • \r\n
  • After the PR is assigned, the reviewer will provide status update\r\nevery 2-3 days. If the PR is not reviewed within 7 days, please feel\r\nfree to ping the reviewer or the vLLM team.
  • \r\n
  • After the review, the reviewer will put an \r\naction-required label on the PR if there are changes required.\r\nThe contributor should address the comments and ping the reviewer to\r\nre-review the PR.
  • \r\n
  • Please respond to all comments within a reasonable time frame. If a\r\ncomment isn't clear or you disagree with a suggestion, feel free to ask\r\nfor clarification or discuss the suggestion.\r\n
  • \r\n
\r\n\r\n

Thank You

\r\n\r\n

Finally, thank you for taking the time to read these guidelines and\r\nfor your interest in contributing to vLLM. Your contributions make vLLM\r\na great tool for everyone!

\r\n\r\n\r\n
\r\n\r\n---------\r\n\r\nSigned-off-by: kerthcet \r\nCo-authored-by: zhaoyang-star \r\nCo-authored-by: Cyrus Leung \r\nCo-authored-by: Simon Mo \r\nCo-authored-by: Cade Daniel \r\nCo-authored-by: Noam Gat \r\nCo-authored-by: Philipp Moritz \r\nCo-authored-by: youkaichao \r\nCo-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>\r\nCo-authored-by: Austin Veselka <50646302+FurtherAI@users.noreply.github.com>\r\nCo-authored-by: leiwen83 \r\nCo-authored-by: Lei Wen \r\nCo-authored-by: Cody Yu \r\nCo-authored-by: SangBin Cho \r\nCo-authored-by: DefTruth <31974251+DefTruth@users.noreply.github.com>\r\nCo-authored-by: Woosuk Kwon \r\nCo-authored-by: Antoni Baum \r\nCo-authored-by: alexm-nm <59768536+alexm-nm@users.noreply.github.com>\r\nCo-authored-by: Mahmoud Ashraf \r\nCo-authored-by: Michael Goin \r\nCo-authored-by: kliuae <17350011+kliuae@users.noreply.github.com>\r\nCo-authored-by: miloice \r\nCo-authored-by: Hao Zhang <152229491+sfc-gh-hazhang@users.noreply.github.com>\r\nCo-authored-by: Dash Desai <1723932+iamontheinet@users.noreply.github.com>\r\nCo-authored-by: Aurick Qiao \r\nCo-authored-by: Aurick Qiao \r\nCo-authored-by: Aurick Qiao \r\nCo-authored-by: Allen.Dou \r\nCo-authored-by: Kunshang Ji \r\nCo-authored-by: Steve Grubb \r\nCo-authored-by: heeju-kim2 <157340754+heeju-kim2@users.noreply.github.com>\r\nCo-authored-by: Chang Su \r\nCo-authored-by: Yikang Shen \r\nCo-authored-by: Swapnil Parekh \r\nCo-authored-by: Sanger Steel \r\nCo-authored-by: Stephen Krider <72541272+skrider@users.noreply.github.com>\r\nCo-authored-by: LiuXiaoxuanPKU \r\nCo-authored-by: Zhuohan Li \r\nCo-authored-by: Kuntai Du \r\nCo-authored-by: Nick Hill \r\nCo-authored-by: SAHIL SUNEJA \r\nCo-authored-by: zifeitong \r\nCo-authored-by: Alex Wu \r\nCo-authored-by: Cade Daniel \r\nCo-authored-by: Jinzhen Lin \r\nCo-authored-by: Alex Wu \r\nCo-authored-by: Pierre Dulac \r\nCo-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>\r\nCo-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>\r\nCo-authored-by: Silencio <19430328+Silencioo@users.noreply.github.com>\r\nCo-authored-by: Silencio \r\nCo-authored-by: Tyler Michael Smith \r\nCo-authored-by: Kante Yin \r\nCo-authored-by: bofeng huang \r\nCo-authored-by: eigenLiu <33959526+eigen2017@users.noreply.github.com>\r\nCo-authored-by: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com>\r\nCo-authored-by: Ubuntu \r\nCo-authored-by: Domenic Barbuzzi ", - "timestamp": "2024-06-03T15:22:54Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/fec35636296a2dfad0880699e061918f64a3b9d6" + "id": "fdd9a3143946895e4cec17ba9db28937d48cdfd9", + "message": "add latest tag for release docker image (#279)\n\nNeed to tag the latest for the release docker image\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm ", + "timestamp": "2024-06-03T20:14:54Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9" }, - "date": 1717436914272, + "date": 1717486794145, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1826.7876810000416, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1968.3447554998565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.30180574333039, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.80455332328651, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.09178000006432, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.64442700046857, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.494657948635467, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.541103052491863, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.822835040136072, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.871362886241304, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:03:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2577.4123235000843, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6106.36480050016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.01311407067139, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.76388846999937, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.47934300036286, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77.77009599999474, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.917851642837732, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.12390406482, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.005374696378382, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.70404936507413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:35:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7477.170930999932, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2072.402735500418, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.60422404133442, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.25535124000089, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.69303699999546, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.30683899984433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.154255656328566, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.805719611744376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.70570047176915, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.947181879927312, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:03:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5792.872616500063, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3709.8409445006837, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.60485665071124, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.5810374200385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.6595430001762, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.83627799979877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 52.57606963795908, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.84327153864837, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 45.77184932528473, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.100691502900332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:30:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6359.727259500005, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15567.694932500217, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.62061250000306, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 211.95435407732356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71.44836100002294, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.61118799983888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.14834417896245, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.28643066526487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.866016742486394, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.88409230534404, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2027.7279659999294, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1885.0982009998916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.131375346716, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.19699156666744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.80655000011757, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.964666000065336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.574741119740576, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.972681839311665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.62783674047078, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.240989983625319, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:23:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5086.0681250005655, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1985.1058935000765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.60802515331791, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.90430167327699, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.84449399956793, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.71939650020795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.191637028120056, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.379385445370405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.83664939242367, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.469891548717342, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:23:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13438.462082000115, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67769.83289000054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 207.69161295667195, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 29962.627152307323, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 162.68896749988926, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 28717.597102499894, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.1323195408694, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 190.42693975378825, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 107.03430961238783, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 201.70061792719784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:41:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2410.390771999573, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62892.62237150001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.01512438667001, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40386.830530422, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.54676449955878, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40184.519388000124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.631069397802065, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.63145586858826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.065749860137487, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.68223204445646, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:09:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58115.24761200053, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2502.4847879994923, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24693.190178945297, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.04334467331257, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 22696.127297999737, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.55580650034244, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.07596480055477, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.610256669780377, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.91163983201352, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.822594410864983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:12:08 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3638.090063499476, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5157.87423349866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 141.89584527663706, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.3476442533065, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.82084300048155, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.08617849936127, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.30408247226253, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.67354362422549, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.61374608368908, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.275710766273264, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 14:54:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1942.1043275001466, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 257234.431072, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.1003962000262, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243333.00650803535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.76818800017645, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243501.93066449993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.053056741675453, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.05580158040362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.178593173806847, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.11879649065315, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:57:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1919.03614299963, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5429.674158000125, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.22951738337119, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.65393957596098, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.753891499440215, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.55695099993318, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.171030743815644, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.96852669778389, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.444907479944426, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.996983684938925, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:29:15 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6011.056845000212, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2651.403954000216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.51665396332828, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.47014652266928, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 76.40881049997006, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.83390100036559, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.447607174651544, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.836291896778306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.04213773936851, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.805192921584684, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:27:53 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6010.300804999929, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6169.347012999879, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.94023836000534, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.59407726000487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.4834220000148, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.40293249979368, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.24737687956218, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.93313201739429, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.325393796500656, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.38531133968375, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 12:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5231.110777500362, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79578.35537449978, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.19965554800243, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59577.81482764667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 169.80007999973168, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72953.15405799988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.8138107661964, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.84221377872534, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.10993344656062, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.01800222368406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:02:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6102.600778999886, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6105.109807499957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.56953884001207, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.2460156533369, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.45767050031282, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.1446569999298, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.50484903749174, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.946839869342064, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.01785497536404, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.72338793562835, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:20:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13024.152943000445, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17937.872257999516, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 166.09906876733658, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1133.8330244626788, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.501374999767, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 227.2085975000664, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 116.45557543230827, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.57282078697003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.93449016074172, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.8903228539812, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-03 15:37:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73612.5902235001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16867.69092550003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54508.354054376, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 226.9577794733262, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66708.06425700017, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 177.5287084997217, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.84795469274688, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 138.74064695416712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.49598703678805, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.08242617070047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:36:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57355.77882799987, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7964.563253999927, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35571.797603253995, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.58499460933248, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35350.50049350002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.4896505001334, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.58072383981926, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.35169198589472, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.55851647084337, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.84237228350271, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:11:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 249174.2924559994, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6431.401653999956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 235009.30902119138, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.18634970667638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 236050.11426999955, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.29239400002507, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.79708311294138, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.63681207882916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.43778767154, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.317072432491216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 13:50:59 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11550.040795000314, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5993.155903500337, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.73528062333148, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.28509318669967, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.74351900016336, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.65418249974755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.77569215848233, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.36501160974001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.93932269039736, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.33126700501665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-03 14:16:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -92862,668 +92862,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-03T20:14:54Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9" }, - "date": 1717486794145, + "date": 1717486834964, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1968.3447554998565, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6440.998319999949, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.80455332328651, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.74126664666437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.64442700046857, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.40421099998184, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.541103052491863, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.68560751570767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.871362886241304, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.37614568889179, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:54 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6106.36480050016, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6106.371244000002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.76388846999937, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.95562292398245, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77.77009599999474, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.63212449989078, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.12390406482, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.39298069218799, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.70404936507413, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.46992398190285, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2072.402735500418, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1885.2783004999765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.25535124000089, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.00666148663913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.30683899984433, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.026208000432234, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.805719611744376, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.060310247904635, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.947181879927312, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.305141051714045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:38 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3709.8409445006837, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2532.953448999706, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.5810374200385, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.43352627734082, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.83627799979877, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.26540150037908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.84327153864837, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.779799559480068, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.100691502900332, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.013203426807557, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:50 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15567.694932500217, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1991.5965344998767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 211.95435407732356, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77.52473940003863, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.61118799983888, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.75204699938695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.28643066526487, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.398765246590358, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.88409230534404, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.453427574962406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1885.0982009998916, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17652.83625300026, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.19699156666744, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 224.15647835066798, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.964666000065336, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 177.60944749988994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.972681839311665, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.21477302456094, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.240989983625319, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 150.62782272179493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1985.1058935000765, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65234.32594349993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.90430167327699, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42344.713461258674, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.71939650020795, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41879.766031500025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.379385445370405, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.30181319539702, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.469891548717342, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.8601412556169, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67769.83289000054, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5174.499678500069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 29962.627152307323, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.2496823665909, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 28717.597102499894, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63.416820999918855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 190.42693975378825, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.91975227240757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 201.70061792719784, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.330177188455274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62892.62237150001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6202.171018999934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40386.830530422, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.24800641333363, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40184.519388000124, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.38204749990473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.63145586858826, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.04272655003397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.68223204445646, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.523500103738975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2502.4847879994923, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6108.172721500068, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.04334467331257, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.90046643668515, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.55580650034244, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.81508199984455, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.610256669780377, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.19418075671441, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.822594410864983, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.85883292820574, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5157.87423349866, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 260871.4947394999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.3476442533065, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245982.4186649913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.08617849936127, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 247559.58809399977, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.67354362422549, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.15954414937362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.275710766273264, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.58994837999971, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 257234.431072, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7992.203209499962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243333.00650803535, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.57673923466547, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243501.93066449993, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 134.29543700010527, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.05580158040362, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.85843149885409, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.11879649065315, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.675191881449955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:00 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5429.674158000125, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1968.6618504997568, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.65393957596098, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.32872556998578, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.55695099993318, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.17519899958279, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.96852669778389, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.543744792374257, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.996983684938925, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.818288283459308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:53:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2651.403954000216, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5493.706313000075, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.47014652266928, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.78887714270485, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.83390100036559, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.23564850064577, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.836291896778306, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.06861954781493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.805192921584684, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.19143516996267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6169.347012999879, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2072.734379499707, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.59407726000487, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.18704796005962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.40293249979368, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.60844400035057, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.93313201739429, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.796053535716636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.38531133968375, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.938045209799384, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:50 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79578.35537449978, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81030.78548899999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59577.81482764667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60722.77092625867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72953.15405799988, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 74389.78645499992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.84221377872534, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.50143122540763, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.01800222368406, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.67409435002708, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6105.109807499957, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15800.433527000223, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.2460156533369, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 213.21391819934192, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.1446569999298, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.30661549981596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.946839869342064, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.86957135761493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.72338793562835, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.48619996002597, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:48 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17937.872257999516, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2631.345844000407, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1133.8330244626788, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.99958248667583, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 227.2085975000664, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.60364950017174, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.57282078697003, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.76793531848235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.8903228539812, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.83319020247417, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16867.69092550003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18187.496590000592, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 226.9577794733262, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1217.637657930638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 177.5287084997217, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 238.2256780001626, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 138.74064695416712, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.7914285805437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.08242617070047, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.79020775133904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7964.563253999927, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3693.1756849999147, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.58499460933248, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.14783481670625, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.4896505001334, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.96243850005703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.35169198589472, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.907789175550537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.84237228350271, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.142332322012734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:54:22 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6431.401653999956, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6129.74736849992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.18634970667638, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.53408270666893, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.29239400002507, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.85616800014395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.63681207882916, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.06100588782548, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.317072432491216, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.85566850307434, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:41:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5993.155903500337, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69548.69400049938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.28509318669967, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31010.123360934667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.65418249974755, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 29984.455719500147, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.36501160974001, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.6870091569195, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.33126700501665, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.79108694443562, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -93544,668 +93544,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-03T20:14:54Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9" }, - "date": 1717486834964, + "date": 1717486879069, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6440.998319999949, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5454.024124500393, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.74126664666437, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.08615106667276, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.40421099998184, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.68465850045322, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.68560751570767, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.27397344693836, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.37614568889179, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.49306999706456, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:38:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6106.371244000002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17870.203808500264, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.95562292398245, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 227.24141396668105, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.63212449989078, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 185.04103700024643, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.39298069218799, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.66718580639292, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 48.46992398190285, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.2060986545837, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:21:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1885.2783004999765, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18723.386613999537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.00666148663913, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1533.9662729066622, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.026208000432234, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 263.36673950027034, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.060310247904635, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.0342609946288, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.305141051714045, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.72360159908413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:53:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2532.953448999706, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65868.51025700013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.43352627734082, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42952.089562826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.26540150037908, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42655.900271499944, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.779799559480068, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.90905274563143, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.013203426807557, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.3437727937067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:59:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1991.5965344998767, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1888.4365559997605, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77.52473940003863, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.1348959600103, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.75204699938695, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.01629649952156, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.398765246590358, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.058869849482795, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.453427574962406, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.30148967918306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:47:35 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17652.83625300026, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2633.9105729998664, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 224.15647835066798, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.04579430002681, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 177.60944749988994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.63644000019121, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.21477302456094, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.879160254872048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 150.62782272179493, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.896625118275395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:32:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65234.32594349993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5156.289227500565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42344.713461258674, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.94653347339529, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41879.766031500025, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.3701980006881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.30181319539702, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.684926592953, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.8601412556169, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.242309537617327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:59:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5174.499678500069, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7986.295465000012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.2496823665909, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.19233767466963, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63.416820999918855, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.38263649994042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.91975227240757, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.88789051677662, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.330177188455274, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.71754815822603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6202.171018999934, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2075.414781500058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.24800641333363, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.51745272664145, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.38204749990473, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.26855750023606, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.04272655003397, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.888373822442976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.523500103738975, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.977945790411958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:08:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6108.172721500068, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3699.253162501009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.90046643668515, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.4739477399647, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.81508199984455, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.98442349993275, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.19418075671441, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.714238157704624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.85883292820574, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.115445349113315, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:16:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 260871.4947394999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259145.63174399995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245982.4186649913, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244845.86074828464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 247559.58809399977, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246173.34830100002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.15954414937362, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.40816401919123, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.58994837999971, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.40508523894187, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7992.203209499962, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78886.52240450005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.57673923466547, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59012.254815819986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 134.29543700010527, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72044.4028244999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.85843149885409, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.02510526002537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.675191881449955, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.75853430024881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:10 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1968.6618504997568, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15903.70668750029, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.32872556998578, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 213.13931954268506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.17519899958279, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 167.40585550041942, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.543744792374257, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.85636857822524, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.818288283459308, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.88485618959274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:19:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5493.706313000075, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6040.232987499621, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.78887714270485, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.90891652400994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.23564850064577, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.51811549973354, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.06861954781493, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.70704003253289, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.19143516996267, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.58010815469473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:52:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2072.734379499707, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6112.463483500051, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.18704796005962, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.97861878998697, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.60844400035057, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.68460800003686, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.796053535716636, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.88820375273403, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.938045209799384, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.82786688691618, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81030.78548899999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6182.661464499915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60722.77092625867, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.88058805329638, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 74389.78645499992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.79186150013084, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.50143122540763, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.09224326144527, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.67409435002708, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.473210161276306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:26:19 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15800.433527000223, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69734.58482200021, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 213.21391819934192, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31249.82819040137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.30661549981596, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30564.476220500183, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.86957135761493, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.5960953923817, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.48619996002597, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.3323966884214, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:06:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2631.345844000407, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1993.6342444998445, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.99958248667583, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.23711747331497, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.60364950017174, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.73105949944511, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.76793531848235, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.430836418906924, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.83319020247417, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.514256438506221, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:25:09 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18187.496590000592, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6428.612557000008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1217.637657930638, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.42314971332341, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 238.2256780001626, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.48310099999344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.7914285805437, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.62859159317332, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.79020775133904, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.34685298628225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:28:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3693.1756849999147, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1975.9936359996573, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.14783481670625, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.27948016336268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.96243850005703, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.18415350010764, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.907789175550537, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.555750750491628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.142332322012734, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.861979915334244, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:45:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6129.74736849992, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6117.3322049999115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.53408270666893, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.90177221667757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.85616800014395, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.22100400012278, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.06100588782548, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.319982478706756, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.85566850307434, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.79184074360817, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:44:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69548.69400049938, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2506.0447780006143, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31010.123360934667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.720866174676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 29984.455719500147, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.5898585003888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.6870091569195, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.703319024563818, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.79108694443562, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.95419341546464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:02:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -94226,1350 +94226,1350 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-03T20:14:54Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9" }, - "date": 1717486879069, + "date": 1717486899331, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5454.024124500393, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5785.776880499725, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.08615106667276, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.92931541067568, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.68465850045322, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.1869515009821, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.27397344693836, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.67676156071734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.49306999706456, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 45.66763363645848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:54:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17870.203808500264, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55960.890349500005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 227.24141396668105, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34505.53588642667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 185.04103700024643, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34309.34696250006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.66718580639292, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.76908291292905, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.2060986545837, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.35385544190784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:33:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18723.386613999537, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6048.551416500004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1533.9662729066622, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.46771771000446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 263.36673950027034, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.17563349999546, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.0342609946288, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.69668735925634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.72360159908413, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.31330238613524, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:29:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65868.51025700013, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1927.273313000569, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42952.089562826, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.58514570668073, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42655.900271499944, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.797591000507964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.90905274563143, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.246371649076861, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.3437727937067, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.561082803845053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:02:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1888.4365559997605, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2554.296359000091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.1348959600103, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.69095240934499, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.01629649952156, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.05938650028111, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.058869849482795, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.02549530720225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.30148967918306, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.146301608164475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:54:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2633.9105729998664, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57331.99107350083, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.04579430002681, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24318.87264188733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.63644000019121, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 22855.887930999415, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.879160254872048, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.4469279026636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.896625118275395, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.98281539278796, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:26:22 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5156.289227500565, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6119.579037499989, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.94653347339529, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.05635790664746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.3701980006881, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.90187099979994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.684926592953, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.697206180365264, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.242309537617327, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.21231925674887, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7986.295465000012, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6024.258447000079, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.19233767466963, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.78459783999871, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.38263649994042, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.81675449998238, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.88789051677662, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.272888658516614, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.71754815822603, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.24340105675399, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:53:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2075.414781500058, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12620.563315500476, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.51745272664145, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 167.80186781333032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.26855750023606, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.6815070008961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.888373822442976, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 113.07529442774695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.977945790411958, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.00880712124115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:13:48 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3699.253162501009, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 250191.674357, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.4739477399647, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 236125.77282334934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.98442349993275, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 236989.25359399983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.714238157704624, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.64188447174384, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.115445349113315, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.79395197603682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:46:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259145.63174399995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1853.0404435000492, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244845.86074828464, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.11207146999732, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246173.34830100002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.19081950034888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.40816401919123, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.796614877419039, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.40508523894187, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.038254724045085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:42:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78886.52240450005, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 74282.63615700007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59012.254815819986, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55073.640529414675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72044.4028244999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67166.00807750001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.02510526002537, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.38270194958277, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.75853430024881, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.41100075899841, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:27:18 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15903.70668750029, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2391.326898999523, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 213.13931954268506, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.25296086932576, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 167.40585550041942, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.60975599962694, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.85636857822524, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.76224480426048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.88485618959274, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.083277643203157, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:07:19 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6040.232987499621, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1966.413384000134, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.90891652400994, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77.18339140666407, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.51811549973354, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.758860000394634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.70704003253289, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.235902758044592, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.58010815469473, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.286106823696699, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:22:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6112.463483500051, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7498.087665499952, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.97861878998697, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.2915395866703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.68460800003686, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.13391999996293, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.88820375273403, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.17845654502635, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.82786688691618, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.71056960190177, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:47:09 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6182.661464499915, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6369.020522500023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.88058805329638, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.99698670667082, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.79186150013084, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.12162149999313, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.09224326144527, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.287103541900656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.473210161276306, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.95729965848386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:11:49 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69734.58482200021, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11410.404378999829, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31249.82819040137, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 188.55270861600667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30564.476220500183, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.18165550018603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.5960953923817, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.01591651072756, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.3323966884214, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.16611659868101, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:03:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1993.6342444998445, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12309.922719000042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.23711747331497, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.7500242380041, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.73105949944511, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 154.75827799991748, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.430836418906924, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.52570002857267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.514256438506221, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.80388002495819, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:48:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6428.612557000008, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5111.47914150024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.42314971332341, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.57914246011205, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.48310099999344, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.33024400019349, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.62859159317332, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.41309699864617, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.34685298628225, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.981834447155244, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:40:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1975.9936359996573, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2054.349911999452, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.27948016336268, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.28242346670837, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.18415350010764, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.542881499983196, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.555750750491628, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.664829629106839, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.861979915334244, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.797083416327892, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:20:03 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6117.3322049999115, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3659.3875905009554, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.90177221667757, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.56452539669831, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.22100400012278, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.14418849963113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.319982478706756, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.47520356026676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.79184074360817, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.816849227068797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:18:57 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2506.0447780006143, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5258.380299500459, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.720866174676, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.02851416803605, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.5898585003888, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.04110500076786, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.703319024563818, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.12063777152027, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.95419341546464, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.32876665983879, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:00:34 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "dhuangnm", - "username": "dhuangnm", - "email": "74931910+dhuangnm@users.noreply.github.com" + "name": "Michael Goin", + "username": "mgoin", + "email": "michael@neuralmagic.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "fdd9a3143946895e4cec17ba9db28937d48cdfd9", - "message": "add latest tag for release docker image (#279)\n\nNeed to tag the latest for the release docker image\r\n\r\n---------\r\n\r\nCo-authored-by: dhuangnm ", - "timestamp": "2024-06-03T20:14:54Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/fdd9a3143946895e4cec17ba9db28937d48cdfd9" + "id": "0257d9df154c888e4e47cbbb7386dad7c3ef100f", + "message": "Update README.md", + "timestamp": "2024-06-04T18:22:13Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f" }, - "date": 1717486899331, + "date": 1717572964196, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5785.776880499725, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73755.18542200007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.92931541067568, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54781.31541236267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.1869515009821, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66663.55898900019, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 52.67676156071734, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.39745048123163, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 45.66763363645848, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.54248457409597, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:24:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55960.890349500005, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5274.142381000274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34505.53588642667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.30165391202172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34309.34696250006, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.8570250006669, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.76908291292905, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.36751358898898, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.35385544190784, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.34859982848261, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:06:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6048.551416500004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6376.852755000016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.46771771000446, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 108.15523370000392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.17563349999546, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.50477400003274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.69668735925634, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.33136610131439, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.31330238613524, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.037428053801804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:23:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1927.273313000569, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6031.7559444999915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.58514570668073, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.05798475333054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.797591000507964, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.35389249989566, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.246371649076861, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.506726089476274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.561082803845053, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.345374117580604, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:24:01 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2554.296359000091, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1847.9448284997488, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.69095240934499, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.11867583997689, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.05938650028111, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.75746250012526, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.02549530720225, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.720778409994828, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.146301608164475, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.957975231980813, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:30:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57331.99107350083, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6115.664882499914, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24318.87264188733, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.67819980002848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 22855.887930999415, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.5118680000869, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.4469279026636, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.6407318778397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.98281539278796, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.16146819897536, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:05:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6119.579037499989, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5123.928123499354, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.05635790664746, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.0871250799925, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.90187099979994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.15235899919935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.697206180365264, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.38484088215165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.21231925674887, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.957168229789495, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:15:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6024.258447000079, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2053.7065545004225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.78459783999871, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.75048137999693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.81675449998238, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.944025999830046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.272888658516614, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.684640059364112, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.24340105675399, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.797214167940783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:51:57 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12620.563315500476, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2412.450625499787, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 167.80186781333032, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.58843513332977, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.6815070008961, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.94715549966713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 113.07529442774695, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.97720822256229, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.00880712124115, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.207593782838163, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:30:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 250191.674357, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6051.014324999869, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 236125.77282334934, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.20469047333943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 236989.25359399983, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77.93934050005191, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.64188447174384, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.5328849296462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.79395197603682, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.22912378618021, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:45:44 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1853.0404435000492, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11522.47239799999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.11207146999732, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 189.9282906139882, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.19081950034888, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 149.82469449978453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.796614877419039, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.57810831842728, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.038254724045085, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.81603877024982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:58:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 74282.63615700007, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57308.29728949993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55073.640529414675, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24429.24578628534, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67166.00807750001, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 22884.999396499552, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.38270194958277, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.12438337329968, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.41100075899841, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.18140868332165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:31:16 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2391.326898999523, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12847.864813999877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.25296086932576, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.18938731532884, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.60975599962694, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.56327750002674, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.76224480426048, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.64073281152855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.083277643203157, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.13423978338325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:04:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1966.413384000134, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5772.151614000904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77.18339140666407, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.26393409336985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.758860000394634, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.76324100088095, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.235902758044592, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.595730911464535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.286106823696699, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 45.781826257653435, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7498.087665499952, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12629.358229999525, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.2915395866703, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 162.56727568198158, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 124.13391999996293, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.46049300060258, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.17845654502635, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 112.88163107184694, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.71056960190177, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.52381771349468, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:58:29 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6369.020522500023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55257.30873649991, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.99698670667082, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34014.00603094267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.12162149999313, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 33341.29117949999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.287103541900656, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.60821701314372, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.95729965848386, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.78219893172137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 02:45:46 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11410.404378999829, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1968.1786680002915, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 188.55270861600667, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77.12485840001439, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.18165550018603, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.845241499984695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.01591651072756, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.227909963066596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.16611659868101, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.32405219681264, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:11:23 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12309.922719000042, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7460.377201500023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.7500242380041, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.0358747626657, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 154.75827799991748, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.64867150005466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.52570002857267, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.98506018080652, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.80388002495819, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.65100405095286, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:36:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5111.47914150024, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3654.1526315004376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.57914246011205, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.80341375336016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.33024400019349, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.23372050039325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.41309699864617, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.38280171831792, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.981834447155244, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.700130815657285, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 05:16:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2054.349911999452, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1929.135523500463, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.28242346670837, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.89699821668425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.542881499983196, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.17522600000302, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.664829629106839, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.259917131621624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.797083416327892, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.53502009480749, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-04 04:17:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3659.3875905009554, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2544.4823535003707, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.56452539669831, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.51974250535325, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.14418849963113, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.8322054999262, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.47520356026676, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.874461210297007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.816849227068797, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.96535923873111, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:47:17 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5258.380299500459, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 249719.7052050003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.02851416803605, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 235743.81317350536, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.04110500076786, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 236612.42044949994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.12063777152027, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.52977677315367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.4.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.32876665983879, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.70908300783768, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.4.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-04 04:56:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -95590,668 +95590,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-04T18:22:13Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f" }, - "date": 1717572964196, + "date": 1717573182072, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73755.18542200007, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1975.7934499998555, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54781.31541236267, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.33599586665939, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66663.55898900019, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.25566899941259, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.39745048123163, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.613003521163858, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.54248457409597, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.903863959951174, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:26:04 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5274.142381000274, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1990.530694500194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.30165391202172, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.04017088661931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.8570250006669, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.069916500215186, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.36751358898898, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.352486622843355, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.34859982848261, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.455696360151357, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:51:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6376.852755000016, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65623.97806299999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 108.15523370000392, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42548.72657927333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.50477400003274, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42052.41047800007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.33136610131439, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.42581577642753, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.037428053801804, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.73442258920154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:39:02 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6031.7559444999915, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5165.230333999716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.05798475333054, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.18472634663583, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.35389249989566, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.3192204998777, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.506726089476274, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.74831494379642, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.345374117580604, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.263469881805783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:45:13 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1847.9448284997488, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71233.34427600002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.11867583997689, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31836.63932091001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.75746250012526, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31021.99345149984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.720778409994828, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 193.19795882911964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.957975231980813, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 204.57356740026793, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:52:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6115.664882499914, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6422.19940900003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.67819980002848, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.88173289333115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.5118680000869, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.78409450004119, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.6407318778397, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.57983124137713, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.16146819897536, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.267313983479774, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:10:46 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5123.928123499354, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6109.015791500269, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.0871250799925, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.68514260334541, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.15235899919935, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.57903599994643, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.38484088215165, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.146640564830165, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.957168229789495, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.725523279960974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:11:42 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2053.7065545004225, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78529.7785214998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.75048137999693, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58721.69160256535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.944025999830046, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71508.66835149986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.684640059364112, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.09258718482668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.797214167940783, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.70630902656838, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:11:51 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2412.450625499787, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6104.897983000001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.58843513332977, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.89376464333085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.94715549966713, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.76336650003213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.97720822256229, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.06915336348656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.207593782838163, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.874942475419786, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:58:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6051.014324999869, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 260933.7399484998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.20469047333943, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246348.652753672, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77.93934050005191, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 247753.58967099988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.5328849296462, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71.52547558982853, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.22912378618021, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.37765993345238, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:17:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11522.47239799999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2068.365681999694, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 189.9282906139882, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.17974005337844, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 149.82469449978453, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.95567000013034, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.57810831842728, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.835997486112298, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.81603877024982, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.943342617564616, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:05:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57308.29728949993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18863.58120349996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24429.24578628534, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1483.9681253440042, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 22884.999396499552, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 265.1577640008327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.12438337329968, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.88222256506052, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.18140868332165, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.82300105146194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:00:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12847.864813999877, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6040.10625599949, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.18938731532884, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.45821917999032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.56327750002674, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.84418099994218, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.64073281152855, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.64609627110425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.13423978338325, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.86997792578394, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:30:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5772.151614000904, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2505.3557870000986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.26393409336985, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.1015805199798, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.76324100088095, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.43590049985505, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 52.595730911464535, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.655124752041274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 45.781826257653435, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.882650920331617, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:18:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12629.358229999525, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15549.75253199973, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 162.56727568198158, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 212.969658120659, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.46049300060258, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 168.86559050044525, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 112.88163107184694, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.20656284526063, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.52381771349468, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.60245925788612, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:26:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55257.30873649991, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5455.755123000927, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34014.00603094267, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 184.0125064293631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 33341.29117949999, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.26980849954998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.60821701314372, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.34378620451925, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.78219893172137, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.65310148491302, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:59:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1968.1786680002915, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2630.551425500016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77.12485840001439, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.0175972480065, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.845241499984695, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.42846149980687, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.227909963066596, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.839985249298476, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.32405219681264, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.839909343441953, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:46:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7460.377201500023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3697.770496000885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.0358747626657, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.23739775337404, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.64867150005466, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.34760749920679, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.98506018080652, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.870631533442317, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.65100405095286, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.040367541896167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:51:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3654.1526315004376, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1896.827923500041, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.80341375336016, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.33098896669617, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.23372050039325, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.04740800001673, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.38280171831792, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.077754018439807, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.700130815657285, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.319771208280605, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1929.135523500463, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6178.988775500102, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.89699821668425, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.24486398669069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.17522600000302, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.42057300013948, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.259917131621624, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.014338630567, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.53502009480749, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.53110167117665, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:17:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2544.4823535003707, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17629.28239049961, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.51974250535325, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 228.15737318399079, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 82.8322054999262, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.52132100017116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.874461210297007, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.45657947859132, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.96535923873111, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 152.18902241546186, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:23:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 249719.7052050003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 8023.916034499962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 235743.81317350536, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.39932492533202, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 236612.42044949994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.8959225000308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.52977677315367, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.86249245694424, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.70908300783768, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.52022299866027, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:40:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -96272,668 +96272,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-04T18:22:13Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f" }, - "date": 1717573182072, + "date": 1717573292382, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1975.7934499998555, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1997.095519499453, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.33599586665939, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.49410707335119, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.25566899941259, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.92237950044364, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.613003521163858, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.423041893780688, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.903863959951174, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.522508858599581, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1990.530694500194, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3708.8484034998146, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.04017088661931, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.70396740337796, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.069916500215186, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 96.86096099994757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.352486622843355, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.751408917969854, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.455696360151357, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.16263272998981, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65623.97806299999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70348.65259299931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42548.72657927333, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31540.73703901402, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42052.41047800007, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30937.1006654992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.42581577642753, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 193.07305639892365, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.73442258920154, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 204.23880160253765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:45 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5165.230333999716, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6097.933693500181, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.18472634663583, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.86066675333373, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.3192204998777, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.7730484999829, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.74831494379642, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.20517006487294, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.263469881805783, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.686507297140324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:14:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71233.34427600002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6452.393932000006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31836.63932091001, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.7267989666669, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31021.99345149984, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.27968650004368, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 193.19795882911964, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.68642417315979, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 204.57356740026793, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.35562801671746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6422.19940900003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6124.660605999907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.88173289333115, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.36626168998959, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.78409450004119, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.43713399992703, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.57983124137713, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.95552879690462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.267313983479774, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.76947833644865, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:37 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6109.015791500269, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1887.9630804999579, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.68514260334541, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.94640251327655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.57903599994643, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.55626250008572, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.146640564830165, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.012353670994822, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.725523279960974, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.249874644191436, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78529.7785214998, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79988.87884600004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58721.69160256535, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59738.638552652, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71508.66835149986, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73359.15299499992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.09258718482668, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.03593499119127, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.70630902656838, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.07576343076235, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:23 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6104.897983000001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6063.049027000488, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.89376464333085, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.42886400928546, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.76336650003213, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.7157725003126, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.06915336348656, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.8214165161257, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.874942475419786, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.03518404416397, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 260933.7399484998, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5198.608692498965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246348.652753672, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.76383455999651, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 247753.58967099988, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.78620149987546, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71.52547558982853, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.83553935022538, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.37765993345238, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.3345803907413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2068.365681999694, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1966.553798500172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.17974005337844, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.29166702000414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.95567000013034, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.26092300048913, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.835997486112298, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.558316077371831, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.943342617564616, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.834920917726885, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18863.58120349996, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19222.069294000903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1483.9681253440042, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1812.5878313566814, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 265.1577640008327, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 295.03822549941106, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.88222256506052, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.2759532426503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.82300105146194, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.19713947191278, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:29:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6040.10625599949, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259086.0955725002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.45821917999032, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244112.833214076, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.84418099994218, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245980.72280350016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.64609627110425, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.04524546886341, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.86997792578394, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.15701582574867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2505.3557870000986, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2071.589972500533, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.1015805199798, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.63230001999182, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.43590049985505, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.06669199973112, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.655124752041274, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.828980113139927, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.882650920331617, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.936821029889687, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15549.75253199973, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2636.5531265000755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 212.969658120659, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.24517320798866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 168.86559050044525, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.84799849959745, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.20656284526063, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.85569717316406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.60245925788612, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.877478772904112, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5455.755123000927, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5440.578917499806, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 184.0125064293631, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.5354573026731, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.26980849954998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.29180650044873, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.34378620451925, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.02768010298062, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.65310148491302, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.25011724984416, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:54:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2630.551425500016, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17301.371945000028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.0175972480065, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 224.18166553200354, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.42846149980687, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.0413339999286, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.839985249298476, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.13134217250737, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.839909343441953, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.33623283996045, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:18 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3697.770496000885, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2525.048247500308, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.23739775337404, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.72952551065949, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.34760749920679, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.95160250013578, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.870631533442317, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.661041977349463, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.040367541896167, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.853015171401005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:46:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1896.827923500041, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15602.865738499986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.33098896669617, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 210.0430408546581, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.04740800001673, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.34792499994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.077754018439807, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.3320489613039, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.319771208280605, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.8927493348655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6178.988775500102, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 8026.447241000028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.24486398669069, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.6388383813328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.42057300013948, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.0641964999395, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.014338630567, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.9432166560783, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.53110167117665, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.63410415277631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:54 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17629.28239049961, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6170.664217999956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 228.15737318399079, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.15708289333816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.52132100017116, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.19553099990117, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 146.45657947859132, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.07245940430774, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 152.18902241546186, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.50399300944225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:11 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 8023.916034499962, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65001.06689899997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.39932492533202, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42183.08303530666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.8959225000308, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42266.78036499993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.86249245694424, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.48058928235726, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.52022299866027, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.91195303058437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:53:25 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -96954,1350 +96954,1350 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-04T18:22:13Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f" }, - "date": 1717573292382, + "date": 1717573342462, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1997.095519499453, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77863.63307100009, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.49410707335119, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58281.66058818401, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.92237950044364, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71254.30481450008, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.423041893780688, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.18448559680624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.522508858599581, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.57614076542656, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3708.8484034998146, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6169.880697000053, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.70396740337796, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.46133026002947, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 96.86096099994757, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.49139850007668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.751408917969854, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.919925371456834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.16263272998981, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.40104160368872, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:43:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70348.65259299931, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16823.780649499895, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31540.73703901402, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 222.3978496020136, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30937.1006654992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.9299864995919, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 193.07305639892365, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 135.8074422525249, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 204.23880160253765, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.19356786475004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:03:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6097.933693500181, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17811.0223725007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.86066675333373, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1061.8634171593512, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.7730484999829, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 227.86593749970052, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.20517006487294, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.41090960013378, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.686507297140324, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.09971559737025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:19:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6452.393932000006, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3699.151089499537, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.7267989666669, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.86275332994893, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.27968650004368, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.76811899983295, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.68642417315979, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.791819456090682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.35562801671746, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.067366447461946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:40:03 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6124.660605999907, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6002.976175000185, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.36626168998959, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.60854235867495, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.43713399992703, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.63529949960503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.95552879690462, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.369098626418904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.76947833644865, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.3331707284874, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:46:16 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1887.9630804999579, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2522.696570500102, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.94640251327655, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.05474123196716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.55626250008572, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.29174350031099, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.012353670994822, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.53805063470437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.249874644191436, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.71867678207576, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:54:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79988.87884600004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5440.339908500391, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59738.638552652, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.3716414972999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73359.15299499992, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 170.48778099979245, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.03593499119127, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.94271573698874, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.07576343076235, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.24281609001152, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:27:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6063.049027000488, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6110.254696499965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.42886400928546, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.08200832666654, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.7157725003126, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.9221575001311, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.8214165161257, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.88085330292909, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 48.03518404416397, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.81224940285663, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5198.608692498965, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5154.448082000272, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.76383455999651, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.23465427997144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.78620149987546, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.30725599996367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.83553935022538, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.775983947753744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.3345803907413, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.273593288596942, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:13:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1966.553798500172, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62147.10263950019, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.29166702000414, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39766.26892405266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.26092300048913, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39408.58177550001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.558316077371831, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.89464387296319, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.834920917726885, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.30746616648189, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:20:14 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19222.069294000903, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6096.493981000094, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1812.5878313566814, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.65361824999916, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 295.03822549941106, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.98213850020875, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.2759532426503, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.331489254710824, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.19713947191278, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.851920693577306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:28:21 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259086.0955725002, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7975.029886000016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244112.833214076, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.9884472533413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245980.72280350016, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 135.43239950001862, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.04524546886341, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.62101486594874, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.15701582574867, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.21389066095032, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:42:10 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2071.589972500533, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1992.651213499812, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.63230001999182, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.69765539995569, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.06669199973112, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.1947960001271, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.828980113139927, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.402564449955385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.936821029889687, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.479954827523413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:13:57 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2636.5531265000755, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69047.98245550046, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.24517320798866, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30681.225544506644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.84799849959745, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 29819.8609330002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.85569717316406, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.35947444170938, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.877478772904112, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 201.84431503164717, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:26:34 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5440.578917499806, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2616.2008219998825, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.5354573026731, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.6278214760144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.29180650044873, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.79476649972639, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.02768010298062, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.69699261167778, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.25011724984416, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.6767269947816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:52:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17301.371945000028, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1964.6460755002408, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 224.18166553200354, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.28143891663603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.0413339999286, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.55193900010636, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.13134217250737, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.63106773922918, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.33623283996045, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.841530274834774, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:33:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2525.048247500308, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2074.2727599999853, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.72952551065949, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.70022990000386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.95160250013578, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.03294449987516, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.661041977349463, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.826978801172709, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.853015171401005, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.95713960298148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:00:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15602.865738499986, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 258395.27599550024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 210.0430408546581, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243924.9034436213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.34792499994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245393.95490599985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.3320489613039, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.57896753179935, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.8927493348655, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.31930061260277, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:07:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 8026.447241000028, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6430.6718875000115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.6388383813328, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.98738446666934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.0641964999395, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.75269950000984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.9432166560783, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.660171282886196, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, - { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.63410415277631, + { + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.35048996213381, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:52:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6170.664217999956, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14931.094657999893, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.15708289333816, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 208.00185868199515, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.19553099990117, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 169.5139149996976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.07245940430774, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.07384939298134, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.50399300944225, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.14083376761975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:11:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65001.06689899997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1884.3689440000162, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42183.08303530666, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.87971182333058, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42266.78036499993, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.980469000540324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.48058928235726, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.001923244417402, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.91195303058437, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.251956720453101, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:01:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "Michael Goin", - "username": "mgoin", - "email": "michael@neuralmagic.com" + "name": "dhuangnm", + "username": "dhuangnm", + "email": "74931910+dhuangnm@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "0257d9df154c888e4e47cbbb7386dad7c3ef100f", - "message": "Update README.md", - "timestamp": "2024-06-04T18:22:13Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/0257d9df154c888e4e47cbbb7386dad7c3ef100f" + "id": "367c5ee80cc75f5d5b6af72de5e1e5e463e386f7", + "message": "strip binaries (#283)\n\nCo-authored-by: dhuangnm ", + "timestamp": "2024-06-05T21:03:26Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7" }, - "date": 1717573342462, + "date": 1717659311471, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77863.63307100009, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2071.795236000071, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58281.66058818401, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.62129434664408, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71254.30481450008, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.27264349997495, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.18448559680624, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.834207957060189, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.57614076542656, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.929855224038599, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:29:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6169.880697000053, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 258306.14780649988, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.46133026002947, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243799.61443960338, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.49139850007668, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245117.86312399953, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.919925371456834, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.64041041772076, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.40104160368872, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.2662787212779, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:14:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16823.780649499895, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7922.697949500048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 222.3978496020136, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.31748106933415, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.9299864995919, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.85172500007502, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 135.8074422525249, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.316413436361834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.19356786475004, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.95898594036765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:35:23 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17811.0223725007, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6151.938715500023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1061.8634171593512, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.07350583665477, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 227.86593749970052, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.0308049999976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.41090960013378, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.024544313732136, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.09971559737025, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.782349249063266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:31:45 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3699.151089499537, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68037.37022549921, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.86275332994893, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30100.42021927734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.76811899983295, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 28906.142724000347, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.791819456090682, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 190.25804743862463, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.067366447461946, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 201.1107669688994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:48:25 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6002.976175000185, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5146.531957000661, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.60854235867495, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.62724450000678, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.63529949960503, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.349385500150674, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.369098626418904, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.724407216546886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.3331707284874, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.250674101384273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:24:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2522.696570500102, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6213.317461499855, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.05474123196716, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.93692238665668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.29174350031099, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.11752150006578, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.53805063470437, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.909500825725175, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.71867678207576, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.38810673453767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:02:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5440.339908500391, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6107.461079999894, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.3716414972999, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.31564741000209, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 170.48778099979245, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.53915549999147, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.94271573698874, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.25123814329617, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.24281609001152, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.85368992077919, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 04:56:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6110.254696499965, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1993.1376780000392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.08200832666654, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.2969698333424, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.9221575001311, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42.129797999677976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.88085330292909, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.455911070524213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.81224940285663, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.530717232272059, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:49:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5154.448082000272, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16751.494377999734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.23465427997144, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 219.94728134001465, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.30725599996367, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.75347900017368, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.775983947753744, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 137.40456832042594, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.273593288596942, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.76655642728957, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:17:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62147.10263950019, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17889.05048000015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39766.26892405266, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1083.3993673066313, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39408.58177550001, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 229.99817649906618, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.89464387296319, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.99140842961486, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.30746616648189, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.93596860500983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:04:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6096.493981000094, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2619.253929000024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.65361824999916, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.25586048800324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.98213850020875, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.6595824998767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.331489254710824, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.589928769144386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.851920693577306, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.50490839269328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:21:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7975.029886000016, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80558.27551649987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.9884472533413, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60218.67284408668, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 135.43239950001862, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 74254.7567985, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.62101486594874, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.85018591317444, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.21389066095032, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.08448521974842, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1992.651213499812, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63741.531855999936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.69765539995569, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41159.210393221336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.1947960001271, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40212.179885499834, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.402564449955385, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 99.67944955550327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.479954827523413, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.0865461263572, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:50:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69047.98245550046, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15127.325313500023, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30681.225544506644, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 210.2396104793285, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 29819.8609330002, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 168.13532199967085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.35947444170938, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.39129914192421, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 201.84431503164717, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.2692568067652, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-05 05:05:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2616.2008219998825, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1895.625746499718, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.6278214760144, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.22735414663475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.79476649972639, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.00738999960231, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.69699261167778, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.011212652119212, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.6767269947816, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.276344271528881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:28:36 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1964.6460755002408, + "value": 1950.5479389999891, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.28143891663603, + "value": 94.45890538333211, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.55193900010636, + "value": 59.743625999999495, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.63106773922918, + "value": 13.461409176814545, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.841530274834774, + "value": 12.74895741758177, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:22:17 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2074.2727599999853, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6046.4773440007775, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.70022990000386, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.64030966269152, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.03294449987516, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.55446850066073, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.826978801172709, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.628013159921274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.95713960298148, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.59649299788427, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:16:02 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 258395.27599550024, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3714.1871910007467, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243924.9034436213, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.33834539669624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245393.95490599985, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.42250300071464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.57896753179935, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.83992388186532, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.31930061260277, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.07884507100644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:44:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6430.6718875000115, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2505.978134999623, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.98738446666934, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.93389008799689, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.75269950000984, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.6131885003415, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.660171282886196, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.713543749562035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.35048996213381, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.84215263645413, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 02:43:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14931.094657999893, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5536.584911499631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 208.00185868199515, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.84844596668216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 169.5139149996976, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.18272499980958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.07384939298134, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.93757760724695, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.14083376761975, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.113197757742746, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 04:09:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1884.3689440000162, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6422.859499999959, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.87971182333058, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.53755632000184, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.980469000540324, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.94469050002772, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.001923244417402, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.634538236675354, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.251956720453101, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.28795376571324, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-05 03:56:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -98318,668 +98318,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-05T21:03:26Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7" }, - "date": 1717659311471, + "date": 1717659337293, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2071.795236000071, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7561.334372500028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.62129434664408, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.53552857467307, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.27264349997495, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.37896199998522, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.834207957060189, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56.343515332182015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.929855224038599, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.79649200897705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 258306.14780649988, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5124.215013000139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243799.61443960338, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.17597861989634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245117.86312399953, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.99882400010392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.64041041772076, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.439815983475754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.2662787212779, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.997657132015405, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:02 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7922.697949500048, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1853.9279535002606, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.31748106933415, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.09767266334832, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.85172500007502, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.97557999994751, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.316413436361834, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.752838098566457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.95898594036765, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.98620253284903, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6151.938715500023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1971.939440999904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.07350583665477, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 77.3307800733528, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.0308049999976, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.88041450025048, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.024544313732136, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.245637929339898, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.782349249063266, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.335751873328213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:35 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68037.37022549921, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6017.480400000068, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30100.42021927734, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.19127569333918, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 28906.142724000347, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.02784850008993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 190.25804743862463, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.38201789582426, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 201.1107669688994, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.42837691540775, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:06 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5146.531957000661, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 74014.74871899995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.62724450000678, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54848.37464709335, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.349385500150674, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67050.76776550004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.724407216546886, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.75695212437148, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.250674101384273, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.18998813923736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6213.317461499855, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2056.14391600011, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.93692238665668, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.60492640665811, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.11752150006578, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.21059399999649, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.909500825725175, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.707244972260469, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.38810673453767, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.822381328736244, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6107.461079999894, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12958.427279500029, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.31564741000209, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.8325882873281, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.53915549999147, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 159.24496200022986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.25123814329617, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.14719103000134, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.85368992077919, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.7680000369014, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:58 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1993.1376780000392, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12817.44234550024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.2969698333424, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 162.61203520401736, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42.129797999677976, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.56327699925896, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.455911070524213, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.50368467609007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.530717232272059, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.96671823434735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16751.494377999734, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1927.0397564996529, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 219.94728134001465, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.1372103999826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.75347900017368, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.00312999963353, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 137.40456832042594, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.281667023213473, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.76655642728957, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.566237753947677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:33 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17889.05048000015, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6131.616917499969, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1083.3993673066313, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.14621484665986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 229.99817649906618, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.46932699986974, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.99140842961486, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.78245623232327, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.93596860500983, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.30715814795518, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:23:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2619.253929000024, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2544.320060500013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.25586048800324, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.19201419733629, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.6595824998767, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.03659450004852, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.589928769144386, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.966716106169642, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.50490839269328, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.082941231904826, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:45 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80558.27551649987, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6380.9977025000535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60218.67284408668, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 109.08876862667437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 74254.7567985, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.93255199997839, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.85018591317444, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.34462272523681, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.08448521974842, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.037268217760975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63741.531855999936, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 56907.804171500175, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41159.210393221336, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23996.813235266003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40212.179885499834, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 22443.69704499968, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 99.67944955550327, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.794710544754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.0865461263572, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.16674524366823, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15127.325313500023, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11652.233324999543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 210.2396104793285, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 188.99047598733705, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 168.13532199967085, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.68642900025952, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.39129914192421, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.104427365891, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.2692568067652, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.61193503831414, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1895.625746499718, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3667.2081065003113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.22735414663475, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.1880004566877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.00738999960231, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 93.01893300016673, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.011212652119212, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.207180873330188, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.276344271528881, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23.781346234008886, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:31 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1950.5479389999891, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5253.7163139995755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.45890538333211, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 181.92108841200388, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.743625999999495, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 176.7236440000488, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.461409176814545, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.95742807844661, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.74895741758177, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.21478237179607, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6046.4773440007775, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55046.67195750005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.64030966269152, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34080.341097640005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.55446850066073, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 33934.031472500006, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.628013159921274, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.86181144956682, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.59649299788427, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 98.16332444601062, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3714.1871910007467, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 251508.07848850012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.33834539669624, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 237415.37543954464, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.42250300071464, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 238630.57078349972, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.83992388186532, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.42414724961917, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.07884507100644, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.95421583107844, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2505.978134999623, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5744.0356564993635, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.93389008799689, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.82568461998986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.6131885003415, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 86.97315400058869, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.713543749562035, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.39422652859815, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.84215263645413, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 45.508422680135055, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5536.584911499631, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6057.473102499898, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.84844596668216, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.94549031666763, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 174.18272499980958, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.69400949986994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.93757760724695, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.587095191553765, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.113197757742746, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.28696862664273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6422.859499999959, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2406.3182155000504, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.53755632000184, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 113.72653467465352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.94469050002772, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.61923700031548, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.634538236675354, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.890889228093677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.28795376571324, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.174985396382663, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -99000,668 +99000,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-05T21:03:26Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7" }, - "date": 1717659337293, + "date": 1717659387678, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7561.334372500028, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7988.147077999997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.53552857467307, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.11015766800696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.37896199998522, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.92036299997744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56.343515332182015, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.92950447171786, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55.79649200897705, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.69751428298563, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:53:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5124.215013000139, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2515.5910209996364, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.17597861989634, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.1154712280028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.99882400010392, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.41899100044247, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.439815983475754, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.64229827587129, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30.997657132015405, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.854738917947568, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:10:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1853.9279535002606, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259713.36875049997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.09767266334832, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244671.78918956334, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.97557999994751, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246374.108513, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.752838098566457, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.70171740964336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.98620253284903, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.33495090392691, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:53:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1971.939440999904, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6103.2862680000335, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 77.3307800733528, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.34661319999742, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.88041450025048, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.0437935000109, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.245637929339898, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.93754922583001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.335751873328213, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.85879893618455, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:47:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6017.480400000068, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3708.6690379992433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.19127569333918, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.79501401664675, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.02784850008993, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.89751599994634, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.38201789582426, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.885508558223382, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.42837691540775, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.157567697125923, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 74014.74871899995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5486.096011500194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54848.37464709335, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.72797567327993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67050.76776550004, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.86483949940157, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.75695212437148, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.35778788926874, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.18998813923736, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.40124741742852, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:26:37 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2056.14391600011, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17455.734022499655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.60492640665811, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 224.04956231399652, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.21059399999649, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 179.17058600005475, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.707244972260469, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.4412121768222, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.822381328736244, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148.41204767122895, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:12:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12958.427279500029, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5158.379417000106, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.8325882873281, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.40565544002311, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 159.24496200022986, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.71433749952848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.14719103000134, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.766268181055466, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.7680000369014, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.264735818489633, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:31:29 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12817.44234550024, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6169.043487500176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 162.61203520401736, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.78861056664633, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.56327699925896, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.47407099990232, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.50368467609007, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.03913815392624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.96671823434735, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.463414853667054, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:25:18 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1927.0397564996529, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6040.882092000174, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.1372103999826, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.98810139734755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.00312999963353, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.44424650012661, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.281667023213473, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.86818574100186, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.566237753947677, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.89272645072441, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:18:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6131.616917499969, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6421.205358500003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.14621484665986, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.06596085332755, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.46932699986974, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.34748050005146, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.78245623232327, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.661673580311565, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.30715814795518, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.346143724572, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:11:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2544.320060500013, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79851.06389799989, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.19201419733629, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60057.25443040801, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.03659450004852, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73214.05616899983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.966716106169642, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.4950842808281, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.082941231904826, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.38619024632074, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:24:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6380.9977025000535, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1879.0043674998742, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 109.08876862667437, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.75263231003555, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.93255199997839, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.382567999819, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.34462272523681, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.956603668530867, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.037268217760975, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.210659433745182, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:12 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 56907.804171500175, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2067.1192844997677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23996.813235266003, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.71093437332213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 22443.69704499968, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42.59328949956398, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.794710544754, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.826639051328756, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 192.16674524366823, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.91504162451401, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:00:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11652.233324999543, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1995.768789000067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 188.99047598733705, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.88001226663012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 147.68642900025952, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.83630199966137, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.104427365891, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.412102007079442, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.61193503831414, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.484164304969804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:06:00 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3667.2081065003113, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6115.007779500047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.1880004566877, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.25073971667614, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 93.01893300016673, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.17800850009371, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.207180873330188, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.21653402766049, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 23.781346234008886, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.858037121296086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:41:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5253.7163139995755, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15728.803737999897, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 181.92108841200388, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 209.45778847200685, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 176.7236440000488, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.12318399995274, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.95742807844661, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.92452536060189, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.21478237179607, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.71982527368462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:50:30 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 55046.67195750005, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2631.9194974998936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34080.341097640005, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.30705779998243, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 33934.031472500006, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.10202550021495, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.86181144956682, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.851983777178372, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 98.16332444601062, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.862149778200866, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:01:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 251508.07848850012, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18910.703681000086, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 237415.37543954464, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1460.6714377166754, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 238630.57078349972, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 265.01184350036056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.42414724961917, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.63675098098742, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65.95421583107844, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.87901538779113, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:41:07 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5744.0356564993635, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64955.77952849999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.82568461998986, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42256.55529801801, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 86.97315400058869, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41436.405920000085, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 52.39422652859815, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.85516992508515, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 45.508422680135055, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.39974855038686, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:18:24 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6057.473102499898, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1959.9752239996633, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.94549031666763, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.68746850665107, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.69400949986994, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.128760000679904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.587095191553765, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.532258797718644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.28696862664273, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.824990888030028, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:18:25 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2406.3182155000504, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70450.58698949925, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 113.72653467465352, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31327.308545114716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.61923700031548, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30185.243406000154, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.890889228093677, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.92121175681964, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.174985396382663, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.58036901354376, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.11.4 (main, Jun 7 2023, 10:57:56) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:59:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -99682,1350 +99682,1350 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-05T21:03:26Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7" }, - "date": 1717659387678, + "date": 1717659700478, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7988.147077999997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5444.995677999941, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 158.11015766800696, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.43153263602773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.92036299997744, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.46796700076084, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.92950447171786, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.280124068436784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.69751428298563, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.37203757283457, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:47:56 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2515.5910209996364, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78778.1954510001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.1154712280028, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59162.487471686676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.41899100044247, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72037.40187199991, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.64229827587129, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.62343158062342, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.854738917947568, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.03294767678214, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:36 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259713.36875049997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1956.2861680001333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244671.78918956334, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.14779560665677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246374.108513, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.077937999518326, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.70171740964336, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.437431639448432, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.33495090392691, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.723519856814338, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:36:06 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6103.2862680000335, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65373.15701099999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.34661319999742, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42277.50111589267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.0437935000109, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41742.67372500003, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.93754922583001, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.17157111138543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.85879893618455, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.43454124109734, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:41:21 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3708.6690379992433, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7956.53410649993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.79501401664675, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.04044950666443, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.89751599994634, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.30081100007374, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.885508558223382, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.715331306465714, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.157567697125923, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.409235365153606, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:40:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5486.096011500194, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70516.79164049984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.72797567327993, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31495.424776235326, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.86483949940157, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30031.702047999715, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.35778788926874, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 193.22762823587541, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.40124741742852, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 204.57441203650677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:48:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17455.734022499655, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259011.95400250025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 224.04956231399652, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244055.78354314202, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 179.17058600005475, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245917.08374050018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.4412121768222, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.01394674477102, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 148.41204767122895, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.20731673968393, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:50 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5158.379417000106, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2064.670552500047, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.40565544002311, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.15265723332655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.71433749952848, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42.677035499764315, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.766268181055466, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.785276597867119, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.264735818489633, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.899947824989589, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:09:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6169.043487500176, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18063.48664699999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.78861056664633, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 232.45559611600157, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.47407099990232, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.47827300001518, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.03913815392624, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 149.0605816427107, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.463414853667054, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 153.05545275452593, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:05:48 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6040.882092000174, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3704.064616999858, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.98810139734755, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.43571817667907, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.44424650012661, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 97.69973150014266, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.86818574100186, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.87243340627162, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.89272645072441, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.099637962123776, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:16:41 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6421.205358500003, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6106.5911880000385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.06596085332755, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 122.00355583999226, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.34748050005146, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.82650199996533, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.661673580311565, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.20168109883727, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.346143724572, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.8089047256667, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:35:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79851.06389799989, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6035.800041500806, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60057.25443040801, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.20267250667773, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73214.05616899983, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.79566750035883, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.4950842808281, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.44995107384843, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.38619024632074, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.61603700214803, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:21:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1879.0043674998742, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1889.1761065005994, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 91.75263231003555, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.37879568333179, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.382567999819, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.173135000084585, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.956603668530867, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.024526196932513, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.210659433745182, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.257026714855618, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2067.1192844997677, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5162.35110100024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.71093437332213, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.73062119988997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42.59328949956398, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.62037499962025, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.826639051328756, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.810021177324366, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.91504162451401, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.314041988770864, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:52 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1995.768789000067, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15547.739225999976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.88001226663012, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 209.84569314132264, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.83630199966137, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 169.8408844995356, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.412102007079442, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 125.59957043409153, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.484164304969804, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.1312142104596, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6115.007779500047, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6116.3640780000605, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.25073971667614, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 129.47920948666857, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.17800850009371, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.83823349994691, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.21653402766049, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.796792863182176, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.858037121296086, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.748115254378284, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:12:55 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15728.803737999897, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2619.209377000061, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 209.45778847200685, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.57118906267958, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.12318399995274, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.23856800006979, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.92452536060189, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.589286586639965, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 133.71982527368462, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.653235870041396, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:02:22 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2631.9194974998936, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2491.7438265001692, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.30705779998243, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114.97949669731922, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.10202550021495, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.77798499996425, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.851983777178372, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.55062783805633, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.862149778200866, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.703622409928943, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:20:58 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18910.703681000086, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6432.053577000033, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1460.6714377166754, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.57128104666981, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 265.01184350036056, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71.33699750005462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.63675098098742, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.59753883390603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.87901538779113, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.31719864656108, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:24:39 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64955.77952849999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1985.349613499693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42256.55529801801, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.35227234002257, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41436.405920000085, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.155220500106225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.85516992508515, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.33685106604449, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.39974855038686, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.407172921438526, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:56:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1959.9752239996633, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6185.281503999931, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.68746850665107, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 106.85994023998623, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.128760000679904, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.0730799997491, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.532258797718644, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.97793281004091, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.824990888030028, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.46808400963367, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:14:53 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70450.58698949925, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18247.279312500723, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31327.308545114716, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1292.66038330267, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30185.243406000154, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246.61586699949112, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.92121175681964, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.7642815834863, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 203.58036901354376, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.93258284940498, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:58:32 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" } ] }, { "commit": { "author": { - "name": "dhuangnm", - "username": "dhuangnm", - "email": "74931910+dhuangnm@users.noreply.github.com" + "name": "Derek Kozikowski", + "username": "derekk-nm", + "email": "106621615+derekk-nm@users.noreply.github.com" }, "committer": { "name": "GitHub", "username": "web-flow", "email": "noreply@github.com" }, - "id": "367c5ee80cc75f5d5b6af72de5e1e5e463e386f7", - "message": "strip binaries (#283)\n\nCo-authored-by: dhuangnm ", - "timestamp": "2024-06-05T21:03:26Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/367c5ee80cc75f5d5b6af72de5e1e5e463e386f7" + "id": "87571b8be8105738d6da87df053d5a32e7fa001e", + "message": "add more models, new num_logprobs (#285)\n\nadding the `microsoft/phi-2`, `google/gemma-1.1-2b-it`, and\r\n`HuggingFaceH4/zephyr-7b-gemma-v0.1` models to\r\ntest_basic_server_correctness.py. this required increasing the number of\r\nlogprobs included in the evaluation to avoid unexpected failure for a\r\nfew prompts with these models. this did not negatively impact the other\r\nmodels.\r\n\r\nran the test locally multiple times. each time we passed, like this:\r\n```\r\n/root/pyvenv/nmv3119a/bin/python3 /root/.local/share/JetBrains/IntelliJIdea2023.3/python/helpers/pycharm/_jb_pytest_runner.py --target test_basic_server_correctness.py::test_models_on_server -- --forked \r\nTesting started at 2:24 PM ...\r\nLaunching pytest with arguments --forked test_basic_server_correctness.py::test_models_on_server --no-header --no-summary -q in /network/derekk/testdev1/nm-vllm/tests/basic_correctness\r\n\r\n============================= test session starts ==============================\r\ncollecting ... collected 7 items\r\nRunning 7 items in this shard: tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None]\r\n\r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None] \r\n\r\n======================== 7 passed in 1332.51s (0:22:12) ========================\r\n```", + "timestamp": "2024-06-06T20:15:52Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e" }, - "date": 1717659700478, + "date": 1717745616435, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5444.995677999941, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14699.765167500118, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.43153263602773, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 211.30380943466784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.46796700076084, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.09230100004424, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.280124068436784, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 118.17419190265723, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.37203757283457, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 123.72379252928387, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:54:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78778.1954510001, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6101.630609999802, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59162.487471686676, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.55697118334956, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 72037.40187199991, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.46602499985056, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.62343158062342, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.0817014139771, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.03294767678214, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.7675226272593, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:28:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1956.2861680001333, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2071.710462000283, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.14779560665677, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.17708044670144, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.077937999518326, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.36620849981409, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.437431639448432, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.851890254539338, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.723519856814338, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.968660420963158, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:21:06 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65373.15701099999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7952.279740999984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42277.50111589267, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.69975958266573, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41742.67372500003, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 130.6232330000512, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.17157111138543, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.555887026381384, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.43454124109734, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.24977585960107, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:03:42 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7956.53410649993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1989.4785374999628, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.04044950666443, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.30192025334934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.30081100007374, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.08235500057344, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.715331306465714, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.371696815007207, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.409235365153606, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.450931988206621, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:55:23 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70516.79164049984, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 63511.5111409998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31495.424776235326, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41315.98948766067, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30031.702047999715, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40922.82402299998, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 193.22762823587541, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.2348996305846, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 204.57441203650677, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.47305210424202, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:04:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259011.95400250025, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16881.950900999982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244055.78354314202, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 227.5808386700167, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245917.08374050018, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 180.28589000005013, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.01394674477102, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 137.69965027748881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.20731673968393, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.0477701306793, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:43:27 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2064.670552500047, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68539.91656400103, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.15265723332655, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30583.081945974624, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42.677035499764315, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 29791.70794899983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.785276597867119, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.3069806478081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.899947824989589, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.39804439303757, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:15:05 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18063.48664699999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6406.818620500018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 232.45559611600157, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110.17746072666644, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.47827300001518, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.06669449998526, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 149.0605816427107, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.55905916254394, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 153.05545275452593, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.23628466129036, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:34:04 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3704.064616999858, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79631.96407450005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.43571817667907, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59710.224629344004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 97.69973150014266, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73366.30043800005, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.87243340627162, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.95557728360805, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.099637962123776, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.4342788293216, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 04:47:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6106.5911880000385, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2621.152668999912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 122.00355583999226, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.64845341196997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.82650199996533, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 84.57594350011277, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.20168109883727, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.723464604925574, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.8089047256667, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.694372836787924, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:20:20 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6035.800041500806, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 258293.03068149966, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.20267250667773, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 243293.79929580932, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.79566750035883, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245236.71960799993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.44995107384843, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.82026946569562, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.61603700214803, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.97497076864781, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:22:57 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1889.1761065005994, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3688.6902534997716, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.37879568333179, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.05877963339904, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 57.173135000084585, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.61334450093273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.024526196932513, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.643208871601594, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.257026714855618, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.011863460333114, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:55:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5162.35110100024, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1972.461887499776, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.73062119988997, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 95.00300123664299, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.62037499962025, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.475705999782804, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.810021177324366, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.541939674034946, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.314041988770864, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.823523306634206, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:15:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15547.739225999976, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2517.0589059998747, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 209.84569314132264, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.37199964132863, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 169.8408844995356, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.5983624997898, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 125.59957043409153, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.680460189964677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.1312142104596, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.916389410917894, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:08:37 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6116.3640780000605, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5158.889067000018, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 129.47920948666857, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.69670423333446, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.83823349994691, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 62.149140499059286, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.796792863182176, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.788614128731986, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.748115254378284, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.294957475099334, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:48:49 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2619.209377000061, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5423.314423500415, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.57118906267958, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 183.49934367465175, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 83.23856800006979, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.76279899942892, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.589286586639965, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.98082605136603, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.653235870041396, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.07007129864882, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:27:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2491.7438265001692, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6169.823153999914, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 114.97949669731922, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 105.81503897332975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79.77798499996425, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.29999150004551, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.55062783805633, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 34.925416817931676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.703622409928943, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.43782526824139, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 04:01:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6432.053577000033, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17779.014052000093, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.57128104666981, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 964.8561994160129, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71.33699750005462, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 216.97776799919666, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.59753883390603, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155.16468318669814, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.31719864656108, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142.75762987123997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 02:42:36 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1985.349613499693, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6098.303635500088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.35227234002257, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.7963707633406, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.155220500106225, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 87.85856650001733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.33685106604449, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.72981414162362, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.407172921438526, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.67089046480836, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:49:52 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6185.281503999931, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6013.731897999605, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 106.85994023998623, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.57696941996983, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.0730799997491, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88.63940199898934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.97793281004091, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.343427984564606, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.46808400963367, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.349752413306696, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-06 03:13:12 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18247.279312500723, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1882.2477829999116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1292.66038330267, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.09608332000546, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246.61586699949112, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.545114999968064, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.7642815834863, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.970106767876448, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.93258284940498, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.26096215724809, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-06 05:30:38 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -101046,668 +101046,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-06T20:15:52Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e" }, - "date": 1717745616435, + "date": 1717746402374, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 14699.765167500118, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2517.7662810001493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 211.30380943466784, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.19161227997877, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 171.09230100004424, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.63697599982333, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 118.17419190265723, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.721679456002857, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 123.72379252928387, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.92413555893253, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:55 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6101.630609999802, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5158.582335999199, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.55697118334956, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.95263102667862, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.46602499985056, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 64.343397999437, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.0817014139771, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.75590584696172, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.7675226272593, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.293732228919737, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:12:42 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2071.710462000283, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 259032.36060049993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.17708044670144, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 244612.28835521068, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.36620849981409, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246070.47978950004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.851890254539338, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.27743964452591, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.968660420963158, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.33746257589912, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:07:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7952.279740999984, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7973.015396499932, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.69975958266573, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.49508787200284, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 130.6232330000512, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.383733499978, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.555887026381384, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.95818823991842, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.24977585960107, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.70170960125752, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:51 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1989.4785374999628, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3715.0806455001657, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.30192025334934, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143.27519131335671, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.08235500057344, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 90.99995199994737, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.371696815007207, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25.068206341116543, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.450931988206621, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.23450235230215, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:42:12 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 63511.5111409998, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78549.1150764999, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41315.98948766067, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58930.692567949336, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40922.82402299998, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 71195.70304599984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 100.2348996305846, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.66825277654573, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 102.47305210424202, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.38126489244029, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:56:08 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16881.950900999982, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2075.8854539999447, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 227.5808386700167, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.4134203466371, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 180.28589000005013, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.12088950068937, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 137.69965027748881, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.946538959099508, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.0477701306793, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.962208010001273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68539.91656400103, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6127.476792999914, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30583.081945974624, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.78522272334249, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 29791.70794899983, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.03906449976239, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.3069806478081, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.32860101464848, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.39804439303757, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.88441506334213, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:56:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6406.818620500018, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5473.464693999631, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 110.17746072666644, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.49806569066035, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.06669449998526, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.2819434990015, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.55905916254394, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.36208355074749, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.23628466129036, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.49202614536779, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:35:04 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 79631.96407450005, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6120.321745000069, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59710.224629344004, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 128.82724009000412, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 73366.30043800005, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.28278450002836, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.95557728360805, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.885607378811606, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.4342788293216, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.90211287091744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:21:05 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2621.152668999912, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17906.05553700061, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.64845341196997, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1134.9781032719814, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 84.57594350011277, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 242.94661450039712, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.723464604925574, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 156.13205289057584, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.694372836787924, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.35530311353594, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:24 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 258293.03068149966, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6063.1630145007875, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 243293.79929580932, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.01546864803822, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 245236.71960799993, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.85463899989554, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69.82026946569562, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.88936309995303, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.97497076864781, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.83838556979124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:47 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3688.6902534997716, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6211.771980999856, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.05877963339904, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.04621746668151, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.61334450093273, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.50385799993819, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.643208871601594, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.00779937451883, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.011863460333114, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.450362258075806, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:39:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1972.461887499776, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17628.689927000323, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 95.00300123664299, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 230.70867368067593, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 61.475705999782804, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 188.1116634999671, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.541939674034946, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 145.20456577223268, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.823523306634206, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 149.8547744586407, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:21 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2517.0589059998747, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1895.8214969998153, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.37199964132863, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 92.05938659671423, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 81.5983624997898, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.38087849992735, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.680460189964677, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.053351842604936, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.916389410917894, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.341031605919355, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:14 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5158.889067000018, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6432.436080499997, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.69670423333446, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.54317384666001, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 62.149140499059286, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.22862650001116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.788614128731986, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.64588538272693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.294957475099334, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.32310866398179, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:07:53 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5423.314423500415, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15676.351509999677, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 183.49934367465175, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 214.77793233064767, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.76279899942892, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 173.0857054999433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.98082605136603, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 126.57796654302709, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.07007129864882, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 131.56042148320319, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:47:02 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6169.823153999914, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1975.4753754996273, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 105.81503897332975, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.6554390200193, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.29999150004551, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.03294850031671, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 34.925416817931676, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.590512873251088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.43782526824139, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.86248584167194, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:05:35 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17779.014052000093, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65101.35466450004, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 964.8561994160129, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42475.126850674664, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 216.97776799919666, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41829.31294600007, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 155.16468318669814, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 101.03935240576146, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 142.75762987123997, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 103.5619433051051, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:22:51 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6098.303635500088, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2632.2685649997766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 127.7963707633406, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.501228742641, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 87.85856650001733, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 85.52717499969731, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.72981414162362, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.762526092246535, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.67089046480836, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.859516402474487, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:41:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6013.731897999605, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69213.93870249995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.57696941996983, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30924.398834644016, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 88.63940199898934, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 29796.208432499952, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.343427984564606, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 191.66613274661282, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.349752413306696, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 202.69734388710387, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:14:54 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1882.2477829999116, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1998.5766394997881, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.09608332000546, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 78.00553531333813, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.545114999968064, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.35294949992385, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.970106767876448, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.49801165393879, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.26096215724809, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.525099976594081, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.10.12 (main, Jun 7 2023, 13:38:31) [GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }, @@ -101728,668 +101728,668 @@ window.BENCHMARK_DATA = { "timestamp": "2024-06-06T20:15:52Z", "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e" }, - "date": 1717746402374, + "date": 1717746526012, "tool": "customSmallerIsBetter", "benches": [ { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2517.7662810001493, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6421.003128999984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 115.19161227997877, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 111.42623192666709, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.63697599982333, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.67188750004516, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 18.721679456002857, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.67433006480225, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 16.92413555893253, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 37.347861589580766, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:06:44 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5158.582335999199, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70343.46215499955, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.95263102667862, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31386.580312674017, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 64.343397999437, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30286.047331999725, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.75590584696172, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 192.48230756539462, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 31.293732228919737, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 204.17271244945383, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:21:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 259032.36060049993, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7974.283517999993, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 244612.28835521068, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.71680846665973, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 246070.47978950004, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 132.23167049989115, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.27743964452591, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59.148484691931976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 67.33746257589912, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.772616362214926, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:48:15 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 7973.015396499932, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1878.2174769999074, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 157.49508787200284, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.77253980332655, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 132.383733499978, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 60.431901500123786, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.95818823991842, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.972919783518263, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58.70170960125752, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.254923318192837, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:00:05 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 3715.0806455001657, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15800.989984999433, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 143.27519131335671, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 212.69218189532452, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 90.99995199994737, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.57445400016513, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 25.068206341116543, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.97760569107744, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 24.23450235230215, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 133.28244030676518, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:52:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78549.1150764999, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6190.521204500101, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 58930.692567949336, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 104.20235481998739, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 71195.70304599984, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65.80370149981718, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 68.66825277654573, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.01656560674975, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 66.38126489244029, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35.58056303142888, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:33:28 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2075.8854539999447, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 260384.93473999962, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 80.4134203466371, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245451.56963145733, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.12088950068937, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 246618.5867190002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.946538959099508, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 70.43640871225261, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.962208010001273, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67.46840009518934, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:19:56 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6127.476792999914, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 65326.86430050012, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 121.78522272334249, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42519.619072352, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.03906449976239, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 42629.212142499906, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.32860101464848, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 100.75692873844417, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 39.88441506334213, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 102.79274229096899, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:25:03 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 5473.464693999631, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5447.402925999995, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 182.49806569066035, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 182.44597291333534, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 172.2819434990015, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 172.4453029992219, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.36208355074749, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.31002367006393, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 38.49202614536779, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.53592806996797, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:00:16 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6120.321745000069, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2636.8615165001756, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 128.82724009000412, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.61561626666784, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.28278450002836, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83.31963150021693, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.885607378811606, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19.77806716244339, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 40.90211287091744, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17.863477117162816, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:53:30 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17906.05553700061, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1964.9785370002064, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1134.9781032719814, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 94.92590485000619, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 242.94661450039712, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.846054999932676, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 156.13205289057584, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.529513843147392, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 144.35530311353594, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.801651558912493, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:36:09 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6063.1630145007875, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 17789.619821000088, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 119.01546864803822, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 223.7930222799987, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 89.85463899989554, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 178.4341494999353, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 54.88936309995303, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 147.45370180062469, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 47.83838556979124, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 152.08704081000582, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:28:11 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6211.771980999856, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 3706.348058501135, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 104.04621746668151, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 144.10143136341503, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.50385799993819, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 91.89893900111201, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.00779937451883, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.774852731533024, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 35.450362258075806, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.09421720982175, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:17:56 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17628.689927000323, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6124.95311549992, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 230.70867368067593, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 127.61545218332762, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 188.1116634999671, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.0930150000122, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 145.20456577223268, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.003689081342976, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 149.8547744586407, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.82435296499245, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:39:25 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1895.8214969998153, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2077.677904000211, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 92.05938659671423, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.92628254669155, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 60.38087849992735, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41.31876200017359, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.053351842604936, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.923252010410245, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.341031605919355, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.962807159673819, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:00:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 6432.436080499997, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6066.111044500758, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 111.54317384666001, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 120.11073751601604, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 70.22862650001116, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.60528650004562, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 36.64588538272693, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 54.64651420804996, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 37.32310866398179, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 47.674594944977116, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:47:17 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 15676.351509999677, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6116.888871499896, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 214.77793233064767, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 121.58046511666726, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 173.0857054999433, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.99082649982847, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 126.57796654302709, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40.2163011519034, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 131.56042148320319, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.91395159878737, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:13:29 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1975.4753754996273, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79834.82763249982, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 94.6554390200193, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 59873.52952924534, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 59.03294850031671, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73565.9825685002, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 13.590512873251088, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.03438943049908, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 12.86248584167194, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 66.48010893843285, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:26:13 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 65101.35466450004, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 2537.462056500317, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 42475.126850674664, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 115.79581256398403, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41829.31294600007, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 80.16767150002124, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 101.03935240576146, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18.801487508640506, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 103.5619433051051, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.960467193772498, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:08:26 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 2632.2685649997766, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18870.926144501027, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 120.501228742641, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1503.0183703219845, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 85.52717499969731, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 265.5922145004297, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 19.762526092246535, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 157.79824274366626, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 17.859516402474487, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 146.09656217046984, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:32:32 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 69213.93870249995, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5155.054059499889, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 30924.398834644016, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 89.98122839994419, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 29796.208432499952, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 58.98962949959241, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 191.66613274661282, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 36.717080017877386, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 202.69734388710387, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 31.255065943148917, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:10:03 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 1998.5766394997881, + "name": "{\"name\": \"median_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 1996.6575409998768, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 78.00553531333813, + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 79.3403228799798, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 41.35294949992385, + "name": "{\"name\": \"median_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.85432399986166, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.49801165393879, + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.385212283854328, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" }, { - "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", - "value": 11.525099976594081, + "name": "{\"name\": \"median_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11.468586433802306, "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] }